diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000000000..1f3393bbbd87c13 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,53 @@ +--- +name: Bug report +about: Create a report to help us improve OpenCilk +title: '' +labels: bug +assignees: '' + +--- + +### Describe the bug + +A clear and concise description of the bug. + +### Expected behavior + +What you expected to happen. + +### OpenCilk version + +- Release version: _[e.g., 1.0]_ +- Distribution method: _[e.g., `.sh` archive, Docker image, ...]_ + +_-OR-_ + +Built from source: +- `opencilk-project`: `branch` commit# (or tag) +- `cheetah`: `branch` commit# (or tag) +- `productivity-tools`: `branch` commit# (or tag) +- `infrastructure`: `branch` commit# (or tag) _(if applicable)_ + +### System information + +- OS: distribution, version _[e.g., Ubuntu 20.04]_ +- CPU: _[e.g., Intel Xeon Platinum 8260]_ + +### Steps to reproduce (include relevant output) + +1. _[E.g., clone repo X]_ + +2. _[E.g., build with parameters XYZ]_ + + Pass the `-v` flag to OpenCilk `clang`/`clang++` to show verbose compilation + commands and output. + +3. _[E.g., compiler crash output, runtime failure...]_ + +### Working example code + +If applicable, include a working code example which triggers the bug. + +### Additional comments + +Add any other comments about the issue here. diff --git a/MIT_LICENSE.TXT b/MIT_LICENSE.TXT new file mode 100644 index 000000000000000..902549ed26ab284 --- /dev/null +++ b/MIT_LICENSE.TXT @@ -0,0 +1,29 @@ +=============================================================== +Modifications to the LLVM Project for OpenCilk are licensed under the +MIT License with the OpenCilk Addendum: +=============================================================== + +Copyright (c) 2020 Massachusetts Institute of Technology + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal with the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +---- OpenCilk Addendum to the MIT License ---- + +As an alternative to distributing the Software under this license, you may +distribute the Software under the LLVM license. diff --git a/README.md b/README.md index a9b29ecbc1a3a47..bf0f5bb1b6fb0fd 100644 --- a/README.md +++ b/README.md @@ -42,3 +42,88 @@ chat](https://discord.gg/xS7Z362), The LLVM project has adopted a [code of conduct](https://llvm.org/docs/CodeOfConduct.html) for participants to all modes of communication within the project. 
+
+## Tapir
+
+For the Tapir compiler IR, cite either the
+[Tapir conference paper][SchardlMoLe17] from ACM PPoPP 2017 or the
+[Tapir journal paper][SchardlMoLe19] from ACM TOPC 2019.
+
+Tapir conference paper, ACM PPoPP 2017:
+> Tao B. Schardl, William S. Moses, and Charles E. Leiserson. 2017.
+> Tapir: Embedding Fork-Join Parallelism into LLVM's Intermediate
+> Representation. In Proceedings of the 22nd ACM SIGPLAN Symposium
+> on Principles and Practice of Parallel Programming (PPoPP '17).
+> 249–265. https://doi.org/10.1145/3018743.3018758
+
+BibTeX:
+```bibtex
+@inproceedings{SchardlMoLe17,
+author = {Schardl, Tao B. and Moses, William S. and Leiserson, Charles E.},
+title = {Tapir: Embedding Fork-Join Parallelism into LLVM's Intermediate Representation},
+year = {2017},
+isbn = {9781450344937},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/3018743.3018758},
+doi = {10.1145/3018743.3018758},
+booktitle = {Proceedings of the 22nd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
+pages = {249--265},
+numpages = {17},
+keywords = {control-flow graph, multicore, tapir, openmp, fork-join parallelism, cilk, optimization, serial semantics, llvm, parallel computing, compiling},
+location = {Austin, Texas, USA},
+series = {PPoPP '17}
+}
+```
+
+Journal article about Tapir, ACM TOPC 2019:
+> Tao B. Schardl, William S. Moses, and Charles E. Leiserson. 2019.
+> Tapir: Embedding Recursive Fork-join Parallelism into LLVM’s
+> Intermediate Representation. ACM Transactions on Parallel Computing 6,
+> 4, Article 19 (December 2019), 33 pages. https://doi.org/10.1145/3365655
+
+BibTeX:
+```bibtex
+@article{SchardlMoLe19,
+author = {Schardl, Tao B. and Moses, William S. and Leiserson, Charles E.},
+title = {Tapir: Embedding Recursive Fork-Join Parallelism into LLVM’s Intermediate Representation},
+year = {2019},
+issue_date = {December 2019},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {6},
+number = {4},
+issn = {2329-4949},
+url = {https://doi.org/10.1145/3365655},
+doi = {10.1145/3365655},
+journal = {ACM Transactions on Parallel Computing},
+month = {dec},
+articleno = {19},
+numpages = {33},
+keywords = {compiling, fork-join parallelism, Tapir, control-flow graph, optimization, parallel computing, OpenMP, multicore, Cilk, serial-projection property, LLVM}
+}
+```
+
+## Acknowledgments
+
+OpenCilk is supported in part by the National Science Foundation,
+under grant number CCRI-1925609, and in part by the
+[USAF-MIT AI Accelerator](https://aia.mit.edu/), which is sponsored by the
+United States Air Force Research Laboratory under Cooperative Agreement
+Number FA8750-19-2-1000.
+
+Any opinions, findings, and conclusions or recommendations expressed
+in this material are those of the author(s) and should not be
+interpreted as representing the official policies or views, either
+expressed or implied, of the United States Air Force, the
+U.S. Government, or the National Science Foundation. The
+U.S. Government is authorized to reproduce and distribute reprints for
+Government purposes notwithstanding any copyright notation herein.
+ +[SchardlLe23]: https://dl.acm.org/doi/10.1145/3572848.3577509 +[SchardlMoLe17]: https://dl.acm.org/doi/10.1145/3155284.3018758 +[SchardlMoLe19]: https://dl.acm.org/doi/10.1145/3365655 + +## TODO: + +Add text and acknowledgements for Kitsune diff --git a/README_LLVM.md b/README_LLVM.md new file mode 100644 index 000000000000000..eb8d624d75cecd8 --- /dev/null +++ b/README_LLVM.md @@ -0,0 +1,39 @@ +# The LLVM Compiler Infrastructure + +Welcome to the LLVM project! + +This repository contains the source code for LLVM, a toolkit for the +construction of highly optimized compilers, optimizers, and run-time +environments. + +The LLVM project has multiple components. The core of the project is +itself called "LLVM". This contains all of the tools, libraries, and header +files needed to process intermediate representations and convert them into +object files. Tools include an assembler, disassembler, bitcode analyzer, and +bitcode optimizer. + +C-like languages use the [Clang](http://clang.llvm.org/) frontend. This +component compiles C, C++, Objective-C, and Objective-C++ code into LLVM bitcode +-- and from there into object files, using LLVM. + +Other components include: +the [libc++ C++ standard library](https://libcxx.llvm.org), +the [LLD linker](https://lld.llvm.org), and more. + +## Getting the Source Code and Building LLVM + +Consult the +[Getting Started with LLVM](https://llvm.org/docs/GettingStarted.html#getting-the-source-code-and-building-llvm) +page for information on building and running LLVM. + +For information on how to contribute to the LLVM project, please take a look at +the [Contributing to LLVM](https://llvm.org/docs/Contributing.html) guide. + +## Getting in touch + +Join the [LLVM Discourse forums](https://discourse.llvm.org/), [Discord +chat](https://discord.gg/xS7Z362), or #llvm IRC channel on +[OFTC](https://oftc.net/). + +The LLVM project has adopted a [code of conduct](https://llvm.org/docs/CodeOfConduct.html) for +participants to all modes of communication within the project. diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index c6496167d3828b9..106f9dee250c19d 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -275,6 +275,9 @@ set(CLANG_DEFAULT_OPENMP_RUNTIME "libomp" CACHE STRING set(CLANG_SYSTEMZ_DEFAULT_ARCH "z10" CACHE STRING "SystemZ Default Arch") +set(CLANG_DEFAULT_TAPIR_RUNTIME "opencilk" CACHE STRING + "Default Tapir runtime used by -ftapir.") + set(CLANG_VENDOR ${PACKAGE_VENDOR} CACHE STRING "Vendor-specific text for showing with version information.") diff --git a/clang/README.md b/clang/README.md new file mode 100644 index 000000000000000..836f2fcd0295f26 --- /dev/null +++ b/clang/README.md @@ -0,0 +1,38 @@ +Cilk-Clang +================================ + +This version of Clang supports the `_Cilk_spawn`, `_Cilk_sync`, and +`_Cilk_for` keywords from Cilk. In particular, this version of Clang +supports the use of _Cilk_spawn before a function call in a statement, +an assignment, or a declaration, as in the following examples: + +``` +_Cilk_spawn foo(n); +``` + +``` +x = _Cilk_spawn foo(n); +``` + +``` +int x = _Cilk_spawn foo(n); +``` + +When spawning a function call, the call arguments and function +arguments are evaluated before the spawn occurs. When spawning an +assignment or declaration, the LHS is also evaluated before the spawn +occurs. + +For convenience, this version of Clang allows `_Cilk_spawn` to spawn an +arbitrary statement, as follows: + +``` +_Cilk_spawn { x = foo(n); } +``` + +Please use this syntax with caution! 
When spawning an arbitrary +statement, the spawn occurs before the evaluation of any part of the +spawned statement. Furthermore, some statements, such as `goto`, are +not legal to spawn. In the future, we will add checks to catch +illegal uses of `_Cilk_spawn`. + diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 46d0a66d59c3753..f65134a3a7b02c9 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1,4 +1,4 @@ -//==--- Attr.td - attribute definitions -----------------------------------===// +.td - attribute definitions -----------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -4127,6 +4127,10 @@ def LoopHint : Attr { /// boolean: fully unrolls loop if State == Enable. /// expression: unrolls loop 'Value' times. + // FIXME: Probably don't need all the grainsize stuff here since it seems + // that it is only used in Cilk pragmas. + /// expression: coarsens the loop with grainsize 'Value'. + let Spellings = [Pragma<"clang", "loop">, Pragma<"", "unroll">, Pragma<"", "nounroll">, Pragma<"", "unroll_and_jam">, Pragma<"", "nounroll_and_jam">]; @@ -4136,11 +4140,11 @@ def LoopHint : Attr { ["vectorize", "vectorize_width", "interleave", "interleave_count", "unroll", "unroll_count", "unroll_and_jam", "unroll_and_jam_count", "pipeline", "pipeline_initiation_interval", "distribute", - "vectorize_predicate"], + "vectorize_predicate", "grainsize"], ["Vectorize", "VectorizeWidth", "Interleave", "InterleaveCount", "Unroll", "UnrollCount", "UnrollAndJam", "UnrollAndJamCount", "PipelineDisabled", "PipelineInitiationInterval", "Distribute", - "VectorizePredicate"]>, + "VectorizePredicate", "TapirGrainSize"]>, EnumArgument<"State", "LoopHintState", /*is_string=*/false, ["enable", "disable", "numeric", "fixed_width", "scalable_width", "assume_safety", "full"], @@ -4163,6 +4167,7 @@ def LoopHint : Attr { case PipelineInitiationInterval: return "pipeline_initiation_interval"; case Distribute: return "distribute"; case VectorizePredicate: return "vectorize_predicate"; + case TapirGrainsize: return "grainsize"; } llvm_unreachable("Unhandled LoopHint option."); } diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 12808eb275fa464..1db006c2e489d77 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -313,6 +313,7 @@ VALUE_CODEGENOPT(TimeTraceGranularity, 32, 500) ///< Minimum time granularity (i ///< traced by time profiler CODEGENOPT(UnrollLoops , 1, 0) ///< Control whether loops are unrolled. CODEGENOPT(RerollLoops , 1, 0) ///< Control whether loops are rerolled. +CODEGENOPT(StripmineLoop , 1, 0) ///< Run Tapir loop stripmining. CODEGENOPT(NoUseJumpTables , 1, 0) ///< Set when -fno-jump-tables is enabled. VALUE_CODEGENOPT(UnwindTables, 2, 0) ///< Unwind tables (1) or asynchronous unwind tables (2) CODEGENOPT(LinkBitcodePostopt, 1, 0) ///< Link builtin bitcodes after optimization pipeline. @@ -376,6 +377,9 @@ VALUE_CODEGENOPT(InlineMaxStackSize, 32, UINT_MAX) // Vector functions library to use. ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibrary::NoLibrary) +/// Tapir target runtime library to use. +ENUM_CODEGENOPT(TapirTarget, TapirTargetID, 8, TapirTargetID::Last_TapirTargetID) + /// The default TLS model to use. 
ENUM_CODEGENOPT(DefaultTLSModel, TLSModel, 2, GeneralDynamicTLSModel) diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index f2a707a8ba8d761..07cdccb73e78fcc 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -20,6 +20,7 @@ #include "llvm/Frontend/Debug/Options.h" #include "llvm/Frontend/Driver/CodeGenOptions.h" #include "llvm/Support/CodeGen.h" +#include "clang/Basic/Tapir.h" #include "llvm/Support/Regex.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Instrumentation/AddressSanitizerOptions.h" @@ -451,6 +452,9 @@ class CodeGenOptions : public CodeGenOptionsBase { /// passed on the command line. std::string StackUsageOutput; + // Path to OpenCilk runtime bitcode file. + std::string OpenCilkABIBitcodeFile; + /// Executable and command-line used to create a given CompilerInvocation. /// Most of the time this will be the full -cc1 command. const char *Argv0 = nullptr; diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 3d8240f8357b406..c17e7c42404e6a0 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -339,6 +339,14 @@ def err_drv_invalid_cf_runtime_abi "'objc', 'standalone', 'swift', 'swift-5.0', 'swift-4.2', 'swift-4.1'">; def err_drv_gnustep_objc_runtime_incompatible_binary : Error< "GNUstep Objective-C runtime version %0 incompatible with target binary format">; +def err_drv_cilk_unsupported: Error< + "Cilk not yet supported for this target">; +def err_drv_opencilk_missing_abi_bitcode: Error< + "Cannot find OpenCilk runtime ABI bitcode file: %0">; +def err_drv_opencilk_resource_dir_missing_include: Error< + "No include directory in OpenCilk resource directory: %0">; +def err_drv_opencilk_resource_dir_missing_lib: Error< + "No lib directory in OpenCilk resource directory: %0">; def err_drv_emit_llvm_link : Error< "-emit-llvm cannot be used when linking">; def err_drv_optimization_remark_pattern : Error< diff --git a/clang/include/clang/Basic/Tapir.h b/clang/include/clang/Basic/Tapir.h new file mode 100644 index 000000000000000..0d038af275d28d6 --- /dev/null +++ b/clang/include/clang/Basic/Tapir.h @@ -0,0 +1,24 @@ +//===--- Tapir.h - C Language Family Language Options -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Defines helper functions for processing flags related to Tapir. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_BASIC_TAPIR_H +#define LLVM_CLANG_BASIC_TAPIR_H + +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" + +namespace clang { + +using TapirTargetID = llvm::TapirTargetID; + +} // end namespace clang + +#endif diff --git a/clang/include/clang/Config/config.h.cmake b/clang/include/clang/Config/config.h.cmake index 27ed69e21562bff..092b4132cb5acdd 100644 --- a/clang/include/clang/Config/config.h.cmake +++ b/clang/include/clang/Config/config.h.cmake @@ -35,6 +35,9 @@ /* Multilib basename for libdir. */ #define CLANG_INSTALL_LIBDIR_BASENAME "${CLANG_INSTALL_LIBDIR_BASENAME}" +/* Default Tapir runtime used by -ftapir. 
*/ +#define CLANG_DEFAULT_TAPIR_RUNTIME "${CLANG_DEFAULT_TAPIR_RUNTIME}" + /* Relative directory for resource files */ #define CLANG_RESOURCE_DIR "${CLANG_RESOURCE_DIR}" diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 15f9ee75492e3f9..86a0a83df5d0035 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3312,6 +3312,16 @@ def fno_knr_functions : Flag<["-"], "fno-knr-functions">, Group, HelpText<"Disable support for K&R C function declarations">, Visibility<[ClangOption, CC1Option, CLOption]>; +def ftapir_EQ : Joined<["-"], "ftapir=">, Group, Flags<[CC1Option]>, + HelpText<"Choose the backend parallel runtime for Tapir instructions">, + Values<"none,serial,cilkplus,cheetah,lambda,omptask,opencilk,qthreads">; +def opencilk_resource_dir_EQ : Joined<["--"], "opencilk-resource-dir=">, + Flags<[NoXarchOption]>, + HelpText<"The directory that holds OpenCilk resource files">; +def opencilk_abi_bitcode_EQ : Joined<["--"], "opencilk-abi-bitcode=">, + Flags<[CC1Option]>, HelpText<"Path to OpenCilk ABI bitcode file">, + MarshallingInfoString>; + def fmudflapth : Flag<["-"], "fmudflapth">, Group; def fmudflap : Flag<["-"], "fmudflap">, Group; def fnested_functions : Flag<["-"], "fnested-functions">, Group; @@ -3965,6 +3975,10 @@ def Wlarge_by_value_copy_EQ : Joined<["-"], "Wlarge-by-value-copy=">, Visibility<[ClangOption, CC1Option]>, MarshallingInfoInt>; +def fstripmine : Flag<["-"], "fstripmine">, Group, + HelpText<"Enable the Tapir loop stripmining passes">; +def fno_stripmine : Flag<["-"], "fno-stripmine">, Group; + // These "special" warning flags are effectively processed as f_Group flags by the driver: // Just silence warnings about -Wlarger-than for now. def Wlarger_than_EQ : Joined<["-"], "Wlarger-than=">, Group; @@ -7240,6 +7254,12 @@ def vectorize_loops : Flag<["-"], "vectorize-loops">, def vectorize_slp : Flag<["-"], "vectorize-slp">, HelpText<"Run the SLP vectorization passes">, MarshallingInfoFlag>; +def stripmine_loops : Flag<["-"], "stripmine-loops">, + HelpText<"Run the Tapir Loop stripmining passes">, + MarshallingInfoFlag>; +def dependent_lib : Joined<["--"], "dependent-lib=">, + HelpText<"Add dependent library">, + MarshallingInfoStringVector>; def linker_option : Joined<["--"], "linker-option=">, HelpText<"Add linker option">, MarshallingInfoStringVector>; diff --git a/clang/include/clang/Driver/Tapir.h b/clang/include/clang/Driver/Tapir.h new file mode 100644 index 000000000000000..a512a0ef830a880 --- /dev/null +++ b/clang/include/clang/Driver/Tapir.h @@ -0,0 +1,31 @@ +//===--- Tapir.h - C Language Family Language Options -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Defines helper functions for processing flags related to Tapir. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_DRIVER_TAPIR_H +#define LLVM_CLANG_DRIVER_TAPIR_H + +#include "clang/Basic/Tapir.h" + +namespace llvm { +namespace opt { + class ArgList; +} +} + +namespace clang { + +TapirTargetID parseTapirTarget(const llvm::opt::ArgList &Args); +std::optional serializeTapirTarget(TapirTargetID Target); + +} // end namespace clang + +#endif diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index ece1384d5d3c020..9d3d148fd96013e 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -816,6 +816,39 @@ class ToolChain { } return TT; } + + /// Check the specified OpenCilk resource directory is valid. + virtual void AddOpenCilkIncludeDir(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const; + + /// Get the OpenCilk library path if it exists. + virtual path_list + getOpenCilkRuntimePaths(const llvm::opt::ArgList &Args) const; + + virtual std::string getOpenCilkBCBasename(const llvm::opt::ArgList &Args, + StringRef Component, + bool AddArch) const; + + virtual std::optional + getOpenCilkBC(const llvm::opt::ArgList &Args, StringRef Component) const; + + virtual std::string getOpenCilkRTBasename(const llvm::opt::ArgList &Args, + StringRef Component, FileType Type, + bool AddArch) const; + + virtual std::string getOpenCilkRT(const llvm::opt::ArgList &Args, + StringRef Component, FileType Type) const; + + /// AddOpenCilkBitcodeABI - Add compiler arguments for linking against the + /// OpenCilk runtime ABI bitcode file. + virtual void AddOpenCilkABIBitcode(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + bool IsLTO = false) const; + + /// AddTapirRuntimeLibArgs - Add the specific linker arguments to use for the + /// given Tapir runtime library type. + virtual void AddTapirRuntimeLibArgs(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const; }; /// Set a ToolChain's effective triple. 
Reset it when the registration object diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 5e57b5e8bc8f159..0a8d5db5cdbc41a 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11911,6 +11911,7 @@ GCCTypeClass EvaluateBuiltinClassifyType(QualType T, return EvaluateBuiltinClassifyType( CanTy->castAs()->getValueType(), LangOpts); + case Type::BlockPointer: case Type::Vector: case Type::ExtVector: return GCCTypeClass::Vector; @@ -16215,6 +16216,8 @@ bool Expr::EvaluateAsInitializer(APValue &Value, const ASTContext &Ctx, llvm_unreachable("Unhandled cleanup; missing full expression marker?"); } + SourceLocation DeclLoc = VD->getLocation(); + QualType DeclTy = VD->getType(); return CheckConstantExpression(Info, DeclLoc, DeclTy, Value, ConstantExprKind::Normal) && CheckMemoryLeaks(Info); diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 526f7f30a386185..0a2e60c4d825736 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -116,6 +116,8 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { return false; } + bool hasSjLjLowering() const override { return true; } + void setArchFeatures(); void getTargetDefinesARMV81A(const LangOptions &Opts, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index e765bbf637a661d..4511249bbeb0186 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -85,6 +85,7 @@ #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/JumpThreading.h" +#include "llvm/Transforms/Tapir/TapirToTarget.h" #include "llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include @@ -289,6 +290,51 @@ static bool asanUseGlobalsGC(const Triple &T, const CodeGenOptions &CGOpts) { return false; } +static TargetLibraryInfoImpl *createTLII(llvm::Triple &TargetTriple, + const CodeGenOptions &CodeGenOpts) { + TargetLibraryInfoImpl *TLII = new TargetLibraryInfoImpl(TargetTriple); + + switch (CodeGenOpts.getVecLib()) { + case CodeGenOptions::Accelerate: + TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::Accelerate, + TargetTriple); + break; + case CodeGenOptions::LIBMVEC: + TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::LIBMVEC_X86, + TargetTriple); + break; + case CodeGenOptions::MASSV: + TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::MASSV, + TargetTriple); + break; + case CodeGenOptions::SVML: + TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SVML, + TargetTriple); + break; + case CodeGenOptions::SLEEF: + TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SLEEFGNUABI, + TargetTriple); + break; + case CodeGenOptions::Darwin_libsystem_m: + TLII->addVectorizableFunctionsFromVecLib( + TargetLibraryInfoImpl::DarwinLibSystemM, TargetTriple); + break; + case CodeGenOptions::ArmPL: + TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::ArmPL, + TargetTriple); + break; + default: + break; + } + + TLII->setTapirTarget(CodeGenOpts.getTapirTarget()); + TLII->setTapirTargetOptions( + std::make_unique(CodeGenOpts.OpenCilkABIBitcodeFile)); + TLII->addTapirTargetLibraryFunctions(); + + return TLII; +} + static std::optional getCodeModel(const CodeGenOptions &CodeGenOpts) { unsigned CodeModel = llvm::StringSwitch(CodeGenOpts.CodeModel) @@ -852,6 +898,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( 
PTO.LoopInterleaving = CodeGenOpts.UnrollLoops; PTO.LoopVectorization = CodeGenOpts.VectorizeLoop; PTO.SLPVectorization = CodeGenOpts.VectorizeSLP; + PTO.LoopStripmine = CodeGenOpts.StripmineLoop; PTO.MergeFunctions = CodeGenOpts.MergeFunctions; // Only enable CGProfilePass when using integrated assembler, since // non-integrated assemblers don't recognize .cgprofile section. @@ -1034,7 +1081,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline( } else if (PrepareForLTO) { MPM.addPass(PB.buildLTOPreLinkDefaultPipeline(Level)); } else { - MPM.addPass(PB.buildPerModuleDefaultPipeline(Level)); + MPM.addPass(PB.buildPerModuleDefaultPipeline(Level), + /* LTOPreLink */ false, TLII->hasTapirTarget()); } } @@ -1258,6 +1306,8 @@ static void runThinLTOBackend( Conf.RemarksFormat = CGOpts.OptRecordFormat; Conf.SplitDwarfFile = CGOpts.SplitDwarfFile; Conf.SplitDwarfOutput = CGOpts.SplitDwarfOutput; + Conf.TapirTarget = CGOpts.getTapirTarget(); + Conf.OpenCilkABIBitcodeFile = CGOpts.OpenCilkABIBitcodeFile; switch (Action) { case Backend_EmitNothing: Conf.PreCodeGenModuleHook = [](size_t Task, const llvm::Module &Mod) { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 5639239359ab827..4c70ff557ea1e58 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -207,12 +207,25 @@ static Address CheckAtomicAlignment(CodeGenFunction &CGF, const CallExpr *E) { return Ptr; } +/// Utility function to start a detach if necessary. +static void MaybeDetach(CodeGenFunction *CGF, + CodeGenFunction::IsSpawnedScope &SpawnedScp) { + if (SpawnedScp.OldScopeIsSpawned()) { + SpawnedScp.RestoreOldScope(); + assert(CGF->CurDetachScope && + "A call was spawned, but no detach scope was pushed."); + if (!CGF->CurDetachScope->IsDetachStarted()) + CGF->CurDetachScope->StartDetach(); + } +} + /// Utility to insert an atomic instruction based on Intrinsic::ID /// and the expression node. static Value *MakeBinaryAtomicValue( CodeGenFunction &CGF, llvm::AtomicRMWInst::BinOp Kind, const CallExpr *E, AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) { + CodeGenFunction::IsSpawnedScope SpawnedScp(&CGF); QualType T = E->getType(); assert(E->getArg(0)->getType()->isPointerType()); assert(CGF.getContext().hasSameUnqualifiedType(T, @@ -234,6 +247,7 @@ static Value *MakeBinaryAtomicValue( } static Value *EmitNontemporalStore(CodeGenFunction &CGF, const CallExpr *E) { + CodeGenFunction::IsSpawnedScope SpawnedScp(&CGF); Value *Val = CGF.EmitScalarExpr(E->getArg(0)); Address Addr = CGF.EmitPointerWithAlignment(E->getArg(1)); @@ -266,6 +280,7 @@ static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF, const CallExpr *E, Instruction::BinaryOps Op, bool Invert = false) { + CodeGenFunction::IsSpawnedScope SpawnedScp(&CGF); QualType T = E->getType(); assert(E->getArg(0)->getType()->isPointerType()); assert(CGF.getContext().hasSameUnqualifiedType(T, @@ -281,6 +296,7 @@ static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF, llvm::Type *ValueType = Val->getType(); Val = EmitToInt(CGF, Val, T, IntType); + MaybeDetach(&CGF, SpawnedScp); llvm::Value *Result = CGF.Builder.CreateAtomicRMW( Kind, DestAddr, Val, llvm::AtomicOrdering::SequentiallyConsistent); Result = CGF.Builder.CreateBinOp(Op, Result, Val); @@ -308,6 +324,7 @@ static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF, /// invoke the function EmitAtomicCmpXchgForMSIntrin. 
static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E, bool ReturnBool) { + CodeGenFunction::IsSpawnedScope SpawnedScp(&CGF); QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType(); Address DestAddr = CheckAtomicAlignment(CGF, E); @@ -494,14 +511,17 @@ static Value *EmitISOVolatileStore(CodeGenFunction &CGF, const CallExpr *E) { static Value *emitUnaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E, unsigned IntrinsicID, unsigned ConstrainedIntrinsicID) { + CodeGenFunction::IsSpawnedScope SpawnedScp(&CGF); llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); if (CGF.Builder.getIsFPConstrained()) { Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType()); + MaybeDetach(&CGF, SpawnedScp); return CGF.Builder.CreateConstrainedFPCall(F, { Src0 }); } else { Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); + MaybeDetach(&CGF, SpawnedScp); return CGF.Builder.CreateCall(F, Src0); } } @@ -511,15 +531,18 @@ static Value *emitUnaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, static Value *emitBinaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E, unsigned IntrinsicID, unsigned ConstrainedIntrinsicID) { + CodeGenFunction::IsSpawnedScope SpawnedScp(&CGF); llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1)); CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); if (CGF.Builder.getIsFPConstrained()) { Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType()); + MaybeDetach(&CGF, SpawnedScp); return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1 }); } else { Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); + MaybeDetach(&CGF, SpawnedScp); return CGF.Builder.CreateCall(F, { Src0, Src1 }); } } @@ -548,6 +571,7 @@ static Value *emitBinaryExpMaybeConstrainedFPBuiltin( static Value *emitTernaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, const CallExpr *E, unsigned IntrinsicID, unsigned ConstrainedIntrinsicID) { + CodeGenFunction::IsSpawnedScope SpawnedScp(&CGF); llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1)); llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2)); @@ -555,9 +579,11 @@ static Value *emitTernaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); if (CGF.Builder.getIsFPConstrained()) { Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType()); + MaybeDetach(&CGF, SpawnedScp); return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1, Src2 }); } else { Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); + MaybeDetach(&CGF, SpawnedScp); return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 }); } } @@ -593,6 +619,7 @@ Value *emitBuiltinWithOneOverloadedType(CodeGenFunction &CGF, const CallExpr *E, for (unsigned I = 0; I < N; ++I) Args.push_back(CGF.EmitScalarExpr(E->getArg(I))); Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Args[0]->getType()); + MaybeDetach(&CGF, SpawnedScp); return CGF.Builder.CreateCall(F, Args, Name); } @@ -600,10 +627,12 @@ Value *emitBuiltinWithOneOverloadedType(CodeGenFunction &CGF, const CallExpr *E, static Value *emitFPIntBuiltin(CodeGenFunction &CGF, const CallExpr *E, unsigned IntrinsicID) { + CodeGenFunction::IsSpawnedScope SpawnedScp(&CGF); llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1)); Function *F = 
CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); + MaybeDetach(&CGF, SpawnedScp); return CGF.Builder.CreateCall(F, {Src0, Src1}); } @@ -612,6 +641,7 @@ static Value * emitMaybeConstrainedFPToIntRoundBuiltin(CodeGenFunction &CGF, const CallExpr *E, unsigned IntrinsicID, unsigned ConstrainedIntrinsicID) { + CodeGenFunction::IsSpawnedScope SpawnedScp(&CGF); llvm::Type *ResultType = CGF.ConvertType(E->getType()); llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); @@ -619,10 +649,12 @@ emitMaybeConstrainedFPToIntRoundBuiltin(CodeGenFunction &CGF, const CallExpr *E, CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, {ResultType, Src0->getType()}); + MaybeDetach(&CGF, SpawnedScp); return CGF.Builder.CreateConstrainedFPCall(F, {Src0}); } else { Function *F = CGF.CGM.getIntrinsic(IntrinsicID, {ResultType, Src0->getType()}); + MaybeDetach(&CGF, SpawnedScp); return CGF.Builder.CreateCall(F, Src0); } } @@ -647,7 +679,9 @@ static Value *emitFrexpBuiltin(CodeGenFunction &CGF, const CallExpr *E, /// EmitFAbs - Emit a call to @llvm.fabs(). static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) { + CodeGenFunction::IsSpawnedScope SpawnedScp(&CGF); Function *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType()); + MaybeDetach(&CGF, SpawnedScp); llvm::CallInst *Call = CGF.Builder.CreateCall(F, V); Call->setDoesNotAccessMemory(); return Call; @@ -2433,8 +2467,10 @@ static bool TypeRequiresBuiltinLaunder(CodeGenModule &CGM, QualType Ty) { } RValue CodeGenFunction::emitRotate(const CallExpr *E, bool IsRotateRight) { + IsSpawnedScope SpawnedScp(this); llvm::Value *Src = EmitScalarExpr(E->getArg(0)); llvm::Value *ShiftAmt = EmitScalarExpr(E->getArg(1)); + MaybeDetach(this, SpawnedScp); // The builtin's shift arg may have a different type than the source arg and // result, but the LLVM intrinsic uses the same type for all values. @@ -2850,8 +2886,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_fmodl: case Builtin::BI__builtin_fmodf128: { CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); + IsSpawnedScope SpawnedScp(this); Value *Arg1 = EmitScalarExpr(E->getArg(0)); Value *Arg2 = EmitScalarExpr(E->getArg(1)); + MaybeDetach(this, SpawnedScp); return RValue::get(Builder.CreateFRem(Arg1, Arg2, "fmod")); } @@ -3158,7 +3196,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIconj: case Builtin::BIconjf: case Builtin::BIconjl: { + IsSpawnedScope SpawnedScp(this); ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0)); + MaybeDetach(this, SpawnedScp); Value *Real = ComplexVal.first; Value *Imag = ComplexVal.second; Imag = Builder.CreateFNeg(Imag, "neg"); @@ -3209,6 +3249,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_clrsbl: case Builtin::BI__builtin_clrsbll: { // clrsb(x) -> clz(x < 0 ? 
~x : x) - 1 or + IsSpawnedScope SpawnedScp(this); Value *ArgValue = EmitScalarExpr(E->getArg(0)); llvm::Type *ArgType = ArgValue->getType(); @@ -3216,6 +3257,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::Type *ResultType = ConvertType(E->getType()); Value *Zero = llvm::Constant::getNullValue(ArgType); + MaybeDetach(this, SpawnedScp); Value *IsNeg = Builder.CreateICmpSLT(ArgValue, Zero, "isneg"); Value *Inverse = Builder.CreateNot(ArgValue, "not"); Value *Tmp = Builder.CreateSelect(IsNeg, Inverse, ArgValue); @@ -3230,6 +3272,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_ctzl: case Builtin::BI__builtin_ctzll: case Builtin::BI__builtin_ctzg: { + IsSpawnedScope SpawnedScp(this); bool HasFallback = BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_ctzg && E->getNumArgs() > 1; @@ -3243,6 +3286,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::Type *ResultType = ConvertType(E->getType()); Value *ZeroUndef = Builder.getInt1(HasFallback || getTarget().isCLZForZeroUndef()); + MaybeDetach(this, SpawnedScp); Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef}); if (Result->getType() != ResultType) Result = @@ -3262,6 +3306,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_clzl: case Builtin::BI__builtin_clzll: case Builtin::BI__builtin_clzg: { + IsSpawnedScope SpawnedScp(this); bool HasFallback = BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_clzg && E->getNumArgs() > 1; @@ -3275,6 +3320,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::Type *ResultType = ConvertType(E->getType()); Value *ZeroUndef = Builder.getInt1(HasFallback || getTarget().isCLZForZeroUndef()); + MaybeDetach(this, SpawnedScp); Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef}); if (Result->getType() != ResultType) Result = @@ -3293,12 +3339,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_ffsl: case Builtin::BI__builtin_ffsll: { // ffs(x) -> x ? 
cttz(x) + 1 : 0 + IsSpawnedScope SpawnedScp(this); Value *ArgValue = EmitScalarExpr(E->getArg(0)); llvm::Type *ArgType = ArgValue->getType(); Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType); llvm::Type *ResultType = ConvertType(E->getType()); + MaybeDetach(this, SpawnedScp); Value *Tmp = Builder.CreateAdd(Builder.CreateCall(F, {ArgValue, Builder.getTrue()}), llvm::ConstantInt::get(ArgType, 1)); @@ -3314,12 +3362,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_parityl: case Builtin::BI__builtin_parityll: { // parity(x) -> ctpop(x) & 1 + IsSpawnedScope SpawnedScp(this); Value *ArgValue = EmitScalarExpr(E->getArg(0)); llvm::Type *ArgType = ArgValue->getType(); Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType); llvm::Type *ResultType = ConvertType(E->getType()); + MaybeDetach(this, SpawnedScp); Value *Tmp = Builder.CreateCall(F, ArgValue); Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1)); if (Result->getType() != ResultType) @@ -3330,12 +3380,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__lzcnt16: case Builtin::BI__lzcnt: case Builtin::BI__lzcnt64: { + IsSpawnedScope SpawnedScp(this); Value *ArgValue = EmitScalarExpr(E->getArg(0)); llvm::Type *ArgType = ArgValue->getType(); Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType); llvm::Type *ResultType = ConvertType(E->getType()); + MaybeDetach(this, SpawnedScp); Value *Result = Builder.CreateCall(F, {ArgValue, Builder.getFalse()}); if (Result->getType() != ResultType) Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true, @@ -3349,12 +3401,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_popcountl: case Builtin::BI__builtin_popcountll: case Builtin::BI__builtin_popcountg: { + IsSpawnedScope SpawnedScp(this); Value *ArgValue = EmitScalarExpr(E->getArg(0)); llvm::Type *ArgType = ArgValue->getType(); Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType); llvm::Type *ResultType = ConvertType(E->getType()); + MaybeDetach(this, SpawnedScp); Value *Result = Builder.CreateCall(F, ArgValue); if (Result->getType() != ResultType) Result = @@ -3571,6 +3625,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, /*EmittedE=*/nullptr, IsDynamic)); } case Builtin::BI__builtin_prefetch: { + IsSpawnedScope SpawnedScp(this); Value *Locality, *RW, *Address = EmitScalarExpr(E->getArg(0)); // FIXME: Technically these constants should of type 'int', yes? RW = (E->getNumArgs() > 1) ? 
EmitScalarExpr(E->getArg(1)) : @@ -3579,11 +3634,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::ConstantInt::get(Int32Ty, 3); Value *Data = llvm::ConstantInt::get(Int32Ty, 1); Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType()); + MaybeDetach(this, SpawnedScp); Builder.CreateCall(F, {Address, RW, Locality, Data}); return RValue::get(nullptr); } case Builtin::BI__builtin_readcyclecounter: { + IsSpawnedScope SpawnedScp(this); Function *F = CGM.getIntrinsic(Intrinsic::readcyclecounter); + MaybeDetach(this, SpawnedScp); return RValue::get(Builder.CreateCall(F)); } case Builtin::BI__builtin_readsteadycounter: { @@ -3591,15 +3649,21 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(Builder.CreateCall(F)); } case Builtin::BI__builtin___clear_cache: { + IsSpawnedScope SpawnedScp(this); Value *Begin = EmitScalarExpr(E->getArg(0)); Value *End = EmitScalarExpr(E->getArg(1)); Function *F = CGM.getIntrinsic(Intrinsic::clear_cache); + MaybeDetach(this, SpawnedScp); return RValue::get(Builder.CreateCall(F, {Begin, End})); } - case Builtin::BI__builtin_trap: + case Builtin::BI__builtin_trap: { + IsSpawnedScope SpawnedScp(this); + MaybeDetach(this, SpawnedScp); EmitTrapCall(Intrinsic::trap); return RValue::get(nullptr); case Builtin::BI__builtin_verbose_trap: { + IsSpawnedScope SpawnedScp(this); + MaybeDetach(this, SpawnedScp); llvm::DILocation *TrapLocation = Builder.getCurrentDebugLocation(); if (getDebugInfo()) { TrapLocation = getDebugInfo()->CreateTrapFailureMessageFor( @@ -3611,10 +3675,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, EmitTrapCall(Intrinsic::trap); return RValue::get(nullptr); } - case Builtin::BI__debugbreak: + case Builtin::BI__debugbreak: { + IsSpawnedScope SpawnedScp(this); + MaybeDetach(this, SpawnedScp); EmitTrapCall(Intrinsic::debugtrap); return RValue::get(nullptr); + } case Builtin::BI__builtin_unreachable: { + IsSpawnedScope SpawnedScp(this); + MaybeDetach(this, SpawnedScp); EmitUnreachable(E->getExprLoc()); // We do need to preserve an insertion point. @@ -3661,11 +3730,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_islessequal: case Builtin::BI__builtin_islessgreater: case Builtin::BI__builtin_isunordered: { + IsSpawnedScope SpawnedScp(this); // Ordered comparisons: we know the arguments to these are matching scalar // floating point values. 
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); Value *LHS = EmitScalarExpr(E->getArg(0)); Value *RHS = EmitScalarExpr(E->getArg(1)); + MaybeDetach(this, SpawnedScp); switch (BuiltinID) { default: llvm_unreachable("Unknown ordered comparison"); @@ -3693,8 +3764,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } case Builtin::BI__builtin_isnan: { + IsSpawnedScope SpawnedScp(this); CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); Value *V = EmitScalarExpr(E->getArg(0)); + MaybeDetach(this, SpawnedScp); if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V)) return RValue::get(Result); return RValue::get( @@ -3711,6 +3784,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } case Builtin::BI__builtin_isinf: { + IsSpawnedScope SpawnedScp(this); CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); Value *V = EmitScalarExpr(E->getArg(0)); if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V)) @@ -3727,6 +3801,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIfinitel: case Builtin::BI__finitel: case Builtin::BI__builtin_isfinite: { + IsSpawnedScope SpawnedScp(this); CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); Value *V = EmitScalarExpr(E->getArg(0)); if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V)) @@ -3737,6 +3812,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } case Builtin::BI__builtin_isnormal: { + IsSpawnedScope SpawnedScp(this); CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); Value *V = EmitScalarExpr(E->getArg(0)); return RValue::get( @@ -4031,10 +4107,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } case Builtin::BI__builtin_isinf_sign: { + IsSpawnedScope SpawnedScp(this); // isinf_sign(x) -> fabs(x) == infinity ? (signbit(x) ? -1 : 1) : 0 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here. Value *Arg = EmitScalarExpr(E->getArg(0)); + MaybeDetach(this, SpawnedScp); Value *AbsArg = EmitFAbs(*this, Arg); Value *IsInf = Builder.CreateFCmpOEQ( AbsArg, ConstantFP::getInfinity(Arg->getType()), "isinf"); @@ -4071,8 +4149,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_fpclassify: { CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here. 
+ IsSpawnedScope SpawnedScp(this); Value *V = EmitScalarExpr(E->getArg(5)); llvm::Type *Ty = ConvertType(E->getArg(5)->getType()); + MaybeDetach(this, SpawnedScp); // Create Result BasicBlock *Begin = Builder.GetInsertBlock(); @@ -4183,10 +4263,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIbzero: case Builtin::BI__builtin_bzero: { + IsSpawnedScope SpawnedScp(this); Address Dest = EmitPointerWithAlignment(E->getArg(0)); Value *SizeVal = EmitScalarExpr(E->getArg(1)); EmitNonNullArgCheck(Dest, E->getArg(0)->getType(), E->getArg(0)->getExprLoc(), FD, 0); + MaybeDetach(this, SpawnedScp); Builder.CreateMemSet(Dest, Builder.getInt8(0), SizeVal, false); return RValue::get(nullptr); } @@ -4210,6 +4292,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_memcpy: case Builtin::BImempcpy: case Builtin::BI__builtin_mempcpy: { + IsSpawnedScope SpawnedScp(this); Address Dest = EmitPointerWithAlignment(E->getArg(0)); Address Src = EmitPointerWithAlignment(E->getArg(1)); Value *SizeVal = EmitScalarExpr(E->getArg(2)); @@ -4240,6 +4323,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, break; case Builtin::BI__builtin___memcpy_chk: { + IsSpawnedScope SpawnedScp(this); // fold __builtin_memcpy_chk(x, y, cst1, cst2) to memcpy iff cst1<=cst2. Expr::EvalResult SizeResult, DstSizeResult; if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) || @@ -4252,20 +4336,24 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Address Dest = EmitPointerWithAlignment(E->getArg(0)); Address Src = EmitPointerWithAlignment(E->getArg(1)); Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size); + MaybeDetach(this, SpawnedScp); Builder.CreateMemCpy(Dest, Src, SizeVal, false); return RValue::get(Dest, *this); } case Builtin::BI__builtin_objc_memmove_collectable: { + IsSpawnedScope SpawnedScp(this); Address DestAddr = EmitPointerWithAlignment(E->getArg(0)); Address SrcAddr = EmitPointerWithAlignment(E->getArg(1)); Value *SizeVal = EmitScalarExpr(E->getArg(2)); + MaybeDetach(this, SpawnedScp); CGM.getObjCRuntime().EmitGCMemmoveCollectable(*this, DestAddr, SrcAddr, SizeVal); return RValue::get(DestAddr, *this); } case Builtin::BI__builtin___memmove_chk: { + IsSpawnedScope SpawnedScp(this); // fold __builtin_memmove_chk(x, y, cst1, cst2) to memmove iff cst1<=cst2. 
Expr::EvalResult SizeResult, DstSizeResult; if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) || @@ -4278,12 +4366,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Address Dest = EmitPointerWithAlignment(E->getArg(0)); Address Src = EmitPointerWithAlignment(E->getArg(1)); Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size); + MaybeDetach(this, SpawnedScp); Builder.CreateMemMove(Dest, Src, SizeVal, false); return RValue::get(Dest, *this); } case Builtin::BImemmove: case Builtin::BI__builtin_memmove: { + IsSpawnedScope SpawnedScp(this); Address Dest = EmitPointerWithAlignment(E->getArg(0)); Address Src = EmitPointerWithAlignment(E->getArg(1)); Value *SizeVal = EmitScalarExpr(E->getArg(2)); @@ -4294,12 +4384,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } case Builtin::BImemset: case Builtin::BI__builtin_memset: { + IsSpawnedScope SpawnedScp(this); Address Dest = EmitPointerWithAlignment(E->getArg(0)); Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)), Builder.getInt8Ty()); Value *SizeVal = EmitScalarExpr(E->getArg(2)); EmitNonNullArgCheck(Dest, E->getArg(0)->getType(), E->getArg(0)->getExprLoc(), FD, 0); + MaybeDetach(this, SpawnedScp); Builder.CreateMemSet(Dest, ByteVal, SizeVal, false); return RValue::get(Dest, *this); } @@ -4316,6 +4408,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(nullptr); } case Builtin::BI__builtin___memset_chk: { + IsSpawnedScope SpawnedScp(this); // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2. Expr::EvalResult SizeResult, DstSizeResult; if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) || @@ -4329,6 +4422,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)), Builder.getInt8Ty()); Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size); + MaybeDetach(this, SpawnedScp); Builder.CreateMemSet(Dest, ByteVal, SizeVal, false); return RValue::get(Dest, *this); } @@ -4451,34 +4545,46 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::ConstantInt::get(Int32Ty, Offset))); } case Builtin::BI__builtin_return_address: { + IsSpawnedScope SpawnedScp(this); Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0), getContext().UnsignedIntTy); Function *F = CGM.getIntrinsic(Intrinsic::returnaddress); + MaybeDetach(this, SpawnedScp); return RValue::get(Builder.CreateCall(F, Depth)); } case Builtin::BI_ReturnAddress: { + IsSpawnedScope SpawnedScp(this); Function *F = CGM.getIntrinsic(Intrinsic::returnaddress); + MaybeDetach(this, SpawnedScp); return RValue::get(Builder.CreateCall(F, Builder.getInt32(0))); } case Builtin::BI__builtin_frame_address: { + IsSpawnedScope SpawnedScp(this); Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0), getContext().UnsignedIntTy); Function *F = CGM.getIntrinsic(Intrinsic::frameaddress, AllocaInt8PtrTy); + MaybeDetach(this, SpawnedScp); return RValue::get(Builder.CreateCall(F, Depth)); } case Builtin::BI__builtin_extract_return_addr: { + IsSpawnedScope SpawnedScp(this); Value *Address = EmitScalarExpr(E->getArg(0)); + MaybeDetach(this, SpawnedScp); Value *Result = getTargetHooks().decodeReturnAddress(*this, Address); return RValue::get(Result); } case Builtin::BI__builtin_frob_return_addr: { + IsSpawnedScope SpawnedScp(this); Value *Address = 
EmitScalarExpr(E->getArg(0)); + MaybeDetach(this, SpawnedScp); Value *Result = getTargetHooks().encodeReturnAddress(*this, Address); return RValue::get(Result); } case Builtin::BI__builtin_dwarf_sp_column: { + IsSpawnedScope SpawnedScp(this); llvm::IntegerType *Ty = cast(ConvertType(E->getType())); + MaybeDetach(this, SpawnedScp); int Column = getTargetHooks().getDwarfEHStackPointer(CGM); if (Column == -1) { CGM.ErrorUnsupported(E, "__builtin_dwarf_sp_column"); @@ -4487,7 +4593,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(llvm::ConstantInt::get(Ty, Column, true)); } case Builtin::BI__builtin_init_dwarf_reg_size_table: { + IsSpawnedScope SpawnedScp(this); Value *Address = EmitScalarExpr(E->getArg(0)); + MaybeDetach(this, SpawnedScp); if (getTargetHooks().initDwarfEHRegSizeTable(*this, Address)) CGM.ErrorUnsupported(E, "__builtin_init_dwarf_reg_size_table"); return RValue::get(llvm::UndefValue::get(ConvertType(E->getType()))); @@ -4516,6 +4624,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(nullptr); } case Builtin::BI__builtin_extend_pointer: { + IsSpawnedScope SpawnedScp(this); // Extends a pointer to the size of an _Unwind_Word, which is // uint64_t on all platforms. Generally this gets poked into a // register and eventually used as an address, so if the @@ -4528,6 +4637,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, // Cast the pointer to intptr_t. Value *Ptr = EmitScalarExpr(E->getArg(0)); + MaybeDetach(this, SpawnedScp); Value *Result = Builder.CreatePtrToInt(Ptr, IntPtrTy, "extend.cast"); // If that's 64 bits, we're done. @@ -4754,6 +4864,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(EmitNontemporalStore(*this, E)); case Builtin::BI__c11_atomic_is_lock_free: case Builtin::BI__atomic_is_lock_free: { + IsSpawnedScope SpawnedScp(this); // Call "bool __atomic_is_lock_free(size_t size, void *ptr)". For the // __c11 builtin, ptr is 0 (indicating a properly-aligned object), since // _Atomic(T) is always properly-aligned. @@ -4771,11 +4882,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, CGM.getTypes().arrangeBuiltinFunctionCall(E->getType(), Args); llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo); llvm::FunctionCallee Func = CGM.CreateRuntimeFunction(FTy, LibCallName); + SpawnedScp.RestoreOldScope(); return EmitCall(FuncInfo, CGCallee::forDirect(Func), ReturnValueSlot(), Args); } case Builtin::BI__atomic_test_and_set: { + IsSpawnedScope SpawnedScp(this); // Look at the argument type to determine whether this is a volatile // operation. The parameter type is always volatile. 
QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType(); @@ -4787,6 +4900,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Value *NewVal = Builder.getInt8(1); Value *Order = EmitScalarExpr(E->getArg(1)); + MaybeDetach(this, SpawnedScp); if (isa(Order)) { int ord = cast(Order)->getZExtValue(); AtomicRMWInst *Result = nullptr; @@ -4861,6 +4975,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } case Builtin::BI__atomic_clear: { + IsSpawnedScope SpawnedScp(this); QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType(); bool Volatile = PtrTy->castAs()->getPointeeType().isVolatileQualified(); @@ -4869,6 +4984,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Ptr = Ptr.withElementType(Int8Ty); Value *NewVal = Builder.getInt8(0); Value *Order = EmitScalarExpr(E->getArg(1)); + MaybeDetach(this, SpawnedScp); if (isa(Order)) { int ord = cast(Order)->getZExtValue(); StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile); @@ -4988,6 +5104,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_signbit: case Builtin::BI__builtin_signbitf: case Builtin::BI__builtin_signbitl: { + IsSpawnedScope SpawnedScp(this); + MaybeDetach(this, SpawnedScp); return RValue::get( Builder.CreateZExt(EmitSignBit(*this, EmitScalarExpr(E->getArg(0))), ConvertType(E->getType()))); @@ -5040,6 +5158,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_subc: case Builtin::BI__builtin_subcl: case Builtin::BI__builtin_subcll: { + IsSpawnedScope SpawnedScp(this); // We translate all of these builtins from expressions of the form: // int x = ..., y = ..., carryin = ..., carryout, result; @@ -5083,6 +5202,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, IntrinsicId = llvm::Intrinsic::usub_with_overflow; break; } + MaybeDetach(this, SpawnedScp); // Construct our resulting LLVM IR expression. llvm::Value *Carry1; @@ -5100,6 +5220,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_add_overflow: case Builtin::BI__builtin_sub_overflow: case Builtin::BI__builtin_mul_overflow: { + IsSpawnedScope SpawnedScp(this); const clang::Expr *LeftArg = E->getArg(0); const clang::Expr *RightArg = E->getArg(1); const clang::Expr *ResultArg = E->getArg(2); @@ -5159,6 +5280,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::Value *Left = EmitScalarExpr(LeftArg); llvm::Value *Right = EmitScalarExpr(RightArg); Address ResultPtr = EmitPointerWithAlignment(ResultArg); + MaybeDetach(this, SpawnedScp); // Extend each operand to the encompassing type. Left = Builder.CreateIntCast(Left, EncompassingLLVMTy, LeftInfo.Signed); @@ -5210,6 +5332,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_smul_overflow: case Builtin::BI__builtin_smull_overflow: case Builtin::BI__builtin_smulll_overflow: { + IsSpawnedScope SpawnedScp(this); // We translate all of these builtins directly to the relevant llvm IR node. 
@@ -5253,7 +5376,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, IntrinsicId = llvm::Intrinsic::smul_with_overflow; break; } - + MaybeDetach(this, SpawnedScp); llvm::Value *Carry; llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry); @@ -5287,8 +5410,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, // __noop always evaluates to an integer literal zero. return RValue::get(ConstantInt::get(IntTy, 0)); case Builtin::BI__builtin_call_with_static_chain: { + IsSpawnedScope SpawnedScp(this); const CallExpr *Call = cast(E->getArg(0)); const Expr *Chain = E->getArg(1); + SpawnedScp.RestoreOldScope(); return EmitCall(Call->getCallee()->getType(), EmitCallee(Call->getCallee()), Call, ReturnValue, EmitScalarExpr(Chain)); @@ -5963,19 +6088,25 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, } case Builtin::BI__builtin_store_half: case Builtin::BI__builtin_store_halff: { + IsSpawnedScope SpawnedScp(this); Value *Val = EmitScalarExpr(E->getArg(0)); Address Address = EmitPointerWithAlignment(E->getArg(1)); + MaybeDetach(this, SpawnedScp); Value *HalfVal = Builder.CreateFPTrunc(Val, Builder.getHalfTy()); Builder.CreateStore(HalfVal, Address); return RValue::get(nullptr); } case Builtin::BI__builtin_load_half: { + IsSpawnedScope SpawnedScp(this); Address Address = EmitPointerWithAlignment(E->getArg(0)); + MaybeDetach(this, SpawnedScp); Value *HalfVal = Builder.CreateLoad(Address); return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getDoubleTy())); } case Builtin::BI__builtin_load_halff: { + IsSpawnedScope SpawnedScp(this); Address Address = EmitPointerWithAlignment(E->getArg(0)); + MaybeDetach(this, SpawnedScp); Value *HalfVal = Builder.CreateLoad(Address); return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy())); } @@ -6111,18 +6242,23 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(Str.getPointer()); } } + IsSpawnedScope SpawnedScp(this); // If this is an alias for a lib function (e.g. __builtin_sin), emit // the call using the normal call path, but using the unmangled // version of the function name. - if (getContext().BuiltinInfo.isLibFunction(BuiltinID)) + if (getContext().BuiltinInfo.isLibFunction(BuiltinID)) { + SpawnedScp.RestoreOldScope(); return emitLibraryCall(*this, FD, E, CGM.getBuiltinLibFunction(FD, BuiltinID)); + } // If this is a predefined lib function (e.g. malloc), emit the call // using exactly the normal call path. - if (getContext().BuiltinInfo.isPredefinedLibFunction(BuiltinID)) + if (getContext().BuiltinInfo.isPredefinedLibFunction(BuiltinID)) { + SpawnedScp.RestoreOldScope(); return emitLibraryCall(*this, FD, E, CGM.getRawFunctionPointer(FD)); + } // Check that a call to a target specific builtin has the correct target // features. @@ -6192,6 +6328,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Args.push_back(ArgValue); } + MaybeDetach(this, SpawnedScp); Value *V = Builder.CreateCall(F, Args); QualType BuiltinRetType = E->getType(); @@ -6234,6 +6371,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, ReturnValue = ReturnValueSlot(DestPtr, false); } + SpawnedScp.RestoreOldScope(); // Now see if we can emit a target-specific builtin. 
if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E, ReturnValue)) { switch (EvalKind) { diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index d7ebffa8c5e4e02..283fe88bf2f7299 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5018,6 +5018,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, bool IsVirtualFunctionPointerThunk) { // FIXME: We no longer need the types from CallArgs; lift up and simplify. + IsSpawnedScope SpawnedScp(this); + assert(Callee.isOrdinary() || Callee.isVirtual()); // Handle struct-return functions by passing a pointer to the @@ -5548,6 +5550,15 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, // 3. Perform the actual call. + // If this call is detached, start the detach, if it hasn't yet been started. + if (SpawnedScp.OldScopeIsSpawned()) { + SpawnedScp.RestoreOldScope(); + assert(CurDetachScope && + "A call was spawned, but no detach scope was pushed."); + if (!CurDetachScope->IsDetachStarted()) + CurDetachScope->StartDetach(); + } + // Deactivate any cleanups that we're supposed to do immediately before // the call. if (!CallArgs.getCleanupsToDeactivate().empty()) @@ -5681,6 +5692,12 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, AllocAlignAttrEmitter AllocAlignAttrEmitter(*this, TargetDecl, CallArgs); Attrs = AllocAlignAttrEmitter.TryEmitAsCallSiteAttribute(Attrs); + // If this call might lead to exit() make sure the runtime can + // be shutdown cleanly. + if (CurSyncRegion && !ScopeIsSynced && !InvokeDest && + Attrs.hasFnAttr(llvm::Attribute::NoReturn)) + EmitImplicitSyncCleanup(nullptr); + // Emit the actual call/invoke instruction. llvm::CallBase *CI; if (!InvokeDest) { diff --git a/clang/lib/CodeGen/CGCilk.cpp b/clang/lib/CodeGen/CGCilk.cpp new file mode 100644 index 000000000000000..08b705eafa61bd1 --- /dev/null +++ b/clang/lib/CodeGen/CGCilk.cpp @@ -0,0 +1,422 @@ +//===--- CGCilk.cpp - Emit LLVM Code for Cilk expressions -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code dealing with code generation of Cilk statements and +// expressions. +// +//===----------------------------------------------------------------------===// + +#include "CodeGenFunction.h" +#include "CGCleanup.h" + +using namespace clang; +using namespace CodeGen; + +CodeGenFunction::IsSpawnedScope::IsSpawnedScope(CodeGenFunction *CGF) + : CGF(CGF), OldIsSpawned(CGF->IsSpawned), + OldSpawnedCleanup(CGF->SpawnedCleanup) { + CGF->IsSpawned = false; + CGF->SpawnedCleanup = OldIsSpawned; +} + +CodeGenFunction::IsSpawnedScope::~IsSpawnedScope() { + RestoreOldScope(); +} + +bool CodeGenFunction::IsSpawnedScope::OldScopeIsSpawned() const { + return OldIsSpawned; +} + +void CodeGenFunction::IsSpawnedScope::RestoreOldScope() { + CGF->IsSpawned = OldIsSpawned; + CGF->SpawnedCleanup = OldSpawnedCleanup; +} + +void CodeGenFunction::EmitImplicitSyncCleanup(llvm::Instruction *SyncRegion) { + llvm::Instruction *SR = SyncRegion; + // If a sync region wasn't specified with this cleanup initially, try to grab + // the current sync region. 
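The CGCall.cpp hunk above also covers calls that cannot return: per its comment, if a `noreturn` call is emitted while a sync region is active, an implicit sync is inserted first so the runtime can shut down cleanly. A hedged illustration of the situation it guards against; `background_work` and `might_fail` are made-up helpers:

```cpp
// Hedged sketch: exit() is a noreturn call, so the change above emits a sync
// for the enclosing sync region before it, letting the outstanding spawned
// task finish before the process tears the Cilk runtime down.
#include <cstdlib>

void background_work(int n);   // assumed to be defined elsewhere
bool might_fail(int n);        // assumed to be defined elsewhere

void run(int n) {
  _Cilk_spawn background_work(n);
  if (might_fail(n))
    std::exit(1);              // implicit sync emitted before this call
  _Cilk_sync;
}
```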
+ if (!SR && CurSyncRegion && CurSyncRegion->getSyncRegionStart()) + SR = CurSyncRegion->getSyncRegionStart(); + if (!SR) + return; + + llvm::BasicBlock *ContinueBlock = createBasicBlock("sync.continue"); + Builder.CreateSync(ContinueBlock, SR); + EmitBlockAfterUses(ContinueBlock); + if (getLangOpts().Exceptions && !CurFn->doesNotThrow()) + EmitCallOrInvoke(CGM.getIntrinsic(llvm::Intrinsic::sync_unwind), { SR }); +} + +void CodeGenFunction::DetachScope::CreateTaskFrameEHState() { + // Save the old EH state. + OldEHResumeBlock = CGF.EHResumeBlock; + CGF.EHResumeBlock = nullptr; + OldExceptionSlot = CGF.ExceptionSlot; + CGF.ExceptionSlot = nullptr; + OldEHSelectorSlot = CGF.EHSelectorSlot; + CGF.EHSelectorSlot = nullptr; + OldNormalCleanupDest = CGF.NormalCleanupDest; + CGF.NormalCleanupDest = Address::invalid(); +} + +void CodeGenFunction::DetachScope::CreateDetachedEHState() { + // Save the old EH state. + TFEHResumeBlock = CGF.EHResumeBlock; + CGF.EHResumeBlock = nullptr; + TFExceptionSlot = CGF.ExceptionSlot; + CGF.ExceptionSlot = nullptr; + TFEHSelectorSlot = CGF.EHSelectorSlot; + CGF.EHSelectorSlot = nullptr; + TFNormalCleanupDest = CGF.NormalCleanupDest; + CGF.NormalCleanupDest = Address::invalid(); +} + +llvm::BasicBlock *CodeGenFunction::DetachScope::RestoreTaskFrameEHState() { + llvm::BasicBlock *NestedEHResumeBlock = CGF.EHResumeBlock; + CGF.EHResumeBlock = TFEHResumeBlock; + CGF.ExceptionSlot = TFExceptionSlot; + CGF.EHSelectorSlot = TFEHSelectorSlot; + CGF.NormalCleanupDest = TFNormalCleanupDest; + return NestedEHResumeBlock; +} + +llvm::BasicBlock *CodeGenFunction::DetachScope::RestoreParentEHState() { + llvm::BasicBlock *NestedEHResumeBlock = CGF.EHResumeBlock; + CGF.EHResumeBlock = OldEHResumeBlock; + CGF.ExceptionSlot = OldExceptionSlot; + CGF.EHSelectorSlot = OldEHSelectorSlot; + CGF.NormalCleanupDest = OldNormalCleanupDest; + return NestedEHResumeBlock; +} + +void CodeGenFunction::DetachScope::EnsureTaskFrame() { + if (!TaskFrame) { + llvm::Function *TaskFrameCreate = + CGF.CGM.getIntrinsic(llvm::Intrinsic::taskframe_create); + TaskFrame = CGF.Builder.CreateCall(TaskFrameCreate); + + // Create a new alloca insertion point within the task frame. + OldAllocaInsertPt = CGF.AllocaInsertPt; + llvm::Value *Undef = llvm::UndefValue::get(CGF.Int32Ty); + CGF.AllocaInsertPt = new llvm::BitCastInst(Undef, CGF.Int32Ty, "", + CGF.Builder.GetInsertBlock()); + // SavedDetachedAllocaInsertPt = CGF.AllocaInsertPt; + + CreateTaskFrameEHState(); + + CGF.pushFullExprCleanup( + static_cast(EHCleanup | LifetimeMarker | TaskExit), + TaskFrame); + } +} + +void CodeGenFunction::DetachScope::InitDetachScope() { + // Create the detached and continue blocks. + DetachedBlock = CGF.createBasicBlock("det.achd"); + ContinueBlock = CGF.createBasicBlock("det.cont"); +} + +void CodeGenFunction::DetachScope::PushSpawnedTaskTerminate() { + CGF.pushFullExprCleanupImpl( + // This cleanup should not be a TaskExit, because we've pushed a TaskExit + // cleanup onto EHStack already, corresponding with the taskframe. + static_cast(EHCleanup | LifetimeMarker), + CGF.CurSyncRegion->getSyncRegionStart()); +} + +void CodeGenFunction::DetachScope::StartDetach() { + InitDetachScope(); + + // Set the detached block as the new alloca insertion point. 
+ TFAllocaInsertPt = CGF.AllocaInsertPt; + llvm::Value *Undef = llvm::UndefValue::get(CGF.Int32Ty); + CGF.AllocaInsertPt = new llvm::BitCastInst(Undef, CGF.Int32Ty, "", + DetachedBlock); + + if (StmtCleanupsScope) + StmtCleanupsScope->DoDetach(); + else + PushSpawnedTaskTerminate(); + + // Create the detach + Detach = CGF.Builder.CreateDetach(DetachedBlock, ContinueBlock, + CGF.CurSyncRegion->getSyncRegionStart()); + + // Save the old EH state. + CreateDetachedEHState(); + + // Emit the detached block. + CGF.EmitBlock(DetachedBlock); + + // Link this detach block to the task frame, if it exists. + if (TaskFrame) { + llvm::Function *TaskFrameUse = + CGF.CGM.getIntrinsic(llvm::Intrinsic::taskframe_use); + CGF.Builder.CreateCall(TaskFrameUse, { TaskFrame }); + } + + // For Cilk, ensure that the detached task is implicitly synced before it + // returns. + CGF.PushSyncRegion()->addImplicitSync(); + + // Initialize lifetime intrinsics for the reference temporary. + if (RefTmp.isValid()) { + switch (RefTmpSD) { + case SD_Automatic: + case SD_FullExpression: + if (auto *Size = CGF.EmitLifetimeStart( + CGF.CGM.getDataLayout().getTypeAllocSize(RefTmp.getElementType()), + RefTmp.getPointer())) { + if (RefTmpSD == SD_Automatic) + CGF.pushCleanupAfterFullExpr(NormalEHLifetimeMarker, + RefTmp, Size); + else + CGF.pushFullExprCleanup(NormalEHLifetimeMarker, + RefTmp, Size); + } + break; + default: + break; + } + } + + DetachStarted = true; +} + +void CodeGenFunction::DetachScope::CleanupDetach() { + if (!DetachStarted || DetachCleanedUp) + return; + + // Pop the sync region for the detached task. + CGF.PopSyncRegion(); + DetachCleanedUp = true; +} + +void CodeGenFunction::DetachScope::EmitTaskEnd() { + if (!CGF.HaveInsertPoint()) + return; + + // The CFG path into the spawned statement should terminate with a `reattach'. + CGF.Builder.CreateReattach(ContinueBlock, + CGF.CurSyncRegion->getSyncRegionStart()); +} + +static void EmitTrivialLandingPad(CodeGenFunction &CGF, + llvm::BasicBlock *TempInvokeDest) { + // Save the current IR generation state. + CGBuilderTy::InsertPoint savedIP = CGF.Builder.saveAndClearIP(); + + // Insert a simple cleanup landingpad at the start of TempInvokeDest. + TempInvokeDest->setName("lpad"); + CGF.EmitBlock(TempInvokeDest); + CGF.Builder.SetInsertPoint(&TempInvokeDest->front()); + + llvm::LandingPadInst *LPadInst = + CGF.Builder.CreateLandingPad(llvm::StructType::get(CGF.Int8PtrTy, + CGF.Int32Ty), 0); + + llvm::Value *LPadExn = CGF.Builder.CreateExtractValue(LPadInst, 0); + CGF.Builder.CreateStore(LPadExn, CGF.getExceptionSlot()); + llvm::Value *LPadSel = CGF.Builder.CreateExtractValue(LPadInst, 1); + CGF.Builder.CreateStore(LPadSel, CGF.getEHSelectorSlot()); + + LPadInst->setCleanup(true); + + // Restore the old IR generation state. + CGF.Builder.restoreIP(savedIP); +} + +void CodeGenFunction::DetachScope::FinishDetach() { + if (!DetachStarted) + return; + + CleanupDetach(); + // Pop the detached_rethrow. + CGF.PopCleanupBlock(); + + EmitTaskEnd(); + + // Restore the alloca insertion point to taskframe_create. + { + llvm::Instruction *Ptr = CGF.AllocaInsertPt; + CGF.AllocaInsertPt = TFAllocaInsertPt; + SavedDetachedAllocaInsertPt = nullptr; + Ptr->eraseFromParent(); + } + + // Restore the task frame's EH state. 
+ llvm::BasicBlock *TaskResumeBlock = RestoreTaskFrameEHState(); + assert(!TaskResumeBlock && "Emission of task produced a resume block"); + + llvm::BasicBlock *InvokeDest = nullptr; + if (TempInvokeDest) { + InvokeDest = CGF.getInvokeDest(); + if (InvokeDest) + TempInvokeDest->replaceAllUsesWith(InvokeDest); + else { + InvokeDest = TempInvokeDest; + EmitTrivialLandingPad(CGF, TempInvokeDest); + TempInvokeDest = nullptr; + } + } + + // Emit the continue block. + CGF.EmitBlock(ContinueBlock); + + // If the detached-rethrow handler is used, add an unwind destination to the + // detach. + if (InvokeDest) { + CGBuilderTy::InsertPoint SavedIP = CGF.Builder.saveIP(); + CGF.Builder.SetInsertPoint(Detach); + // Create the new detach instruction. + llvm::DetachInst *NewDetach = CGF.Builder.CreateDetach( + Detach->getDetached(), Detach->getContinue(), InvokeDest, + Detach->getSyncRegion()); + // Remove the old detach. + Detach->eraseFromParent(); + Detach = NewDetach; + CGF.Builder.restoreIP(SavedIP); + } + + // Pop the taskframe. + CGF.PopCleanupBlock(); + + // Restore the alloca insertion point. + { + llvm::Instruction *Ptr = CGF.AllocaInsertPt; + CGF.AllocaInsertPt = OldAllocaInsertPt; + TFAllocaInsertPt = nullptr; + Ptr->eraseFromParent(); + } + + // Restore the original EH state. + llvm::BasicBlock *NestedEHResumeBlock = RestoreParentEHState(); + + if (TempInvokeDest) { + if (llvm::BasicBlock *InvokeDest = CGF.getInvokeDest()) { + TempInvokeDest->replaceAllUsesWith(InvokeDest); + } else + EmitTrivialLandingPad(CGF, TempInvokeDest); + } + + // If invocations in the parallel task led to the creation of EHResumeBlock, + // we need to create for outside the task. In particular, the new + // EHResumeBlock must use an ExceptionSlot and EHSelectorSlot allocated + // outside of the task. + if (NestedEHResumeBlock) { + if (!NestedEHResumeBlock->use_empty()) { + // Translate the nested EHResumeBlock into an appropriate EHResumeBlock in + // the outer scope. + NestedEHResumeBlock->replaceAllUsesWith( + CGF.getEHResumeBlock( + isa(NestedEHResumeBlock->getTerminator()))); + } + delete NestedEHResumeBlock; + } +} + +Address CodeGenFunction::DetachScope::CreateDetachedMemTemp( + QualType Ty, StorageDuration SD, const Twine &Name) { + // There shouldn't be multiple reference temporaries needed. + assert(!RefTmp.isValid() && + "Already created a reference temporary in this detach scope."); + + // Create the reference temporary + RefTmp = CGF.CreateMemTemp(Ty, Name); + RefTmpSD = SD; + + return RefTmp; +} + +CodeGenFunction::TaskFrameScope::TaskFrameScope(CodeGenFunction &CGF) + : CGF(CGF) { + if (!CGF.CurSyncRegion) + return; + + llvm::Function *TaskFrameCreate = + CGF.CGM.getIntrinsic(llvm::Intrinsic::taskframe_create); + TaskFrame = CGF.Builder.CreateCall(TaskFrameCreate); + + // Create a new alloca insertion point within the task frame. + OldAllocaInsertPt = CGF.AllocaInsertPt; + llvm::Value *Undef = llvm::UndefValue::get(CGF.Int32Ty); + CGF.AllocaInsertPt = new llvm::BitCastInst(Undef, CGF.Int32Ty, "", + CGF.Builder.GetInsertBlock()); + + // Save the old EH state. 
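Taken together, StartDetach and FinishDetach above give a single `_Cilk_spawn` its CFG shape: a `detach` into the `det.achd` block, a `reattach` to the `det.cont` continuation, and exception paths routed through `detached.rethrow`/`taskframe.resume`. A hedged sketch of the source and the rough IR skeleton (block and intrinsic names are the ones created in this file; the exact IR varies); `f` and `g` are placeholders:

```cpp
// Hedged sketch.  For the spawn below, the machinery above emits roughly:
//   entry:         %sr = call token @llvm.syncregion.start()
//                  ; arguments to f() evaluated here
//                  detach within %sr, label %det.achd, label %det.cont
//   det.achd:      call i32 @f(...)              ; the spawned task
//                  reattach within %sr, label %det.cont
//   det.cont:      call void @g()                ; the continuation
//                  sync within %sr, label %sync.continue
//   sync.continue: ...
int f(int x);   // assumed to be defined elsewhere
void g();       // assumed to be defined elsewhere

int spawn_shape(int x) {
  int y = _Cilk_spawn f(x);
  g();
  _Cilk_sync;
  return y;
}
```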
+ OldEHResumeBlock = CGF.EHResumeBlock; + CGF.EHResumeBlock = nullptr; + OldExceptionSlot = CGF.ExceptionSlot; + CGF.ExceptionSlot = nullptr; + OldEHSelectorSlot = CGF.EHSelectorSlot; + CGF.EHSelectorSlot = nullptr; + OldNormalCleanupDest = CGF.NormalCleanupDest; + CGF.NormalCleanupDest = Address::invalid(); + + CGF.pushFullExprCleanup( + static_cast(NormalAndEHCleanup | LifetimeMarker | TaskExit), + this); +} + +CodeGenFunction::TaskFrameScope::~TaskFrameScope() { + if (!CGF.CurSyncRegion) + return; + + // Pop the taskframe. + CGF.PopCleanupBlock(); + + // Restore the alloca insertion point. + { + llvm::Instruction *Ptr = CGF.AllocaInsertPt; + CGF.AllocaInsertPt = OldAllocaInsertPt; + Ptr->eraseFromParent(); + } + + // Restore the original EH state. + llvm::BasicBlock *NestedEHResumeBlock = CGF.EHResumeBlock; + CGF.EHResumeBlock = OldEHResumeBlock; + CGF.ExceptionSlot = OldExceptionSlot; + CGF.EHSelectorSlot = OldEHSelectorSlot; + CGF.NormalCleanupDest = OldNormalCleanupDest; + + if (TempInvokeDest) { + if (llvm::BasicBlock *InvokeDest = CGF.getInvokeDest()) { + TempInvokeDest->replaceAllUsesWith(InvokeDest); + } else + EmitTrivialLandingPad(CGF, TempInvokeDest); + + if (TempInvokeDest->use_empty()) + delete TempInvokeDest; + } + + // If invocations in the parallel task led to the creation of EHResumeBlock, + // we need to create for outside the task. In particular, the new + // EHResumeBlock must use an ExceptionSlot and EHSelectorSlot allocated + // outside of the task. + if (NestedEHResumeBlock) { + if (!NestedEHResumeBlock->use_empty()) { + // Translate the nested EHResumeBlock into an appropriate EHResumeBlock in + // the outer scope. + NestedEHResumeBlock->replaceAllUsesWith( + CGF.getEHResumeBlock( + isa(NestedEHResumeBlock->getTerminator()))); + } + delete NestedEHResumeBlock; + } +} + +llvm::Instruction *CodeGenFunction::EmitSyncRegionStart() { + // Start the sync region. To ensure the syncregion.start call dominates all + // uses of the generated token, we insert this call at the alloca insertion + // point. + llvm::Instruction *SRStart = llvm::CallInst::Create( + CGM.getIntrinsic(llvm::Intrinsic::syncregion_start), + "syncreg", AllocaInsertPt); + return SRStart; +} diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp index 4e210a9e3c95fff..5d83c300a719c1e 100644 --- a/clang/lib/CodeGen/CGCleanup.cpp +++ b/clang/lib/CodeGen/CGCleanup.cpp @@ -176,6 +176,8 @@ void *EHScopeStack::pushCleanup(CleanupKind Kind, size_t Size) { InnermostEHScope = stable_begin(); if (IsLifetimeMarker) Scope->setLifetimeMarker(); + if (Kind & TaskExit) + Scope->setTaskExit(); // With Windows -EHa, Invoke llvm.seh.scope.begin() for EHCleanup // If exceptions are disabled/ignored and SEH is not in use, then there is no @@ -409,9 +411,10 @@ void CodeGenFunction::ResolveBranchFixups(llvm::BasicBlock *Block) { /// Pops cleanup blocks until the given savepoint is reached. 
void CodeGenFunction::PopCleanupBlocks( EHScopeStack::stable_iterator Old, - std::initializer_list ValuesToReload) { + std::initializer_list ValuesToReload, bool AfterSync) { assert(Old.isValid()); + bool EmitSync = AfterSync; bool HadBranches = false; while (EHStack.stable_begin() != Old) { EHCleanupScope &Scope = cast(*EHStack.begin()); @@ -423,7 +426,8 @@ void CodeGenFunction::PopCleanupBlocks( bool FallThroughIsBranchThrough = Old.strictlyEncloses(Scope.getEnclosingNormalCleanup()); - PopCleanupBlock(FallThroughIsBranchThrough); + PopCleanupBlock(FallThroughIsBranchThrough, EmitSync); + EmitSync = false; } // If we didn't have any branches, the insertion point before cleanups must @@ -465,8 +469,8 @@ void CodeGenFunction::PopCleanupBlocks( /// cleanups from the given savepoint in the lifetime-extended cleanups stack. void CodeGenFunction::PopCleanupBlocks( EHScopeStack::stable_iterator Old, size_t OldLifetimeExtendedSize, - std::initializer_list ValuesToReload) { - PopCleanupBlocks(Old, ValuesToReload); + std::initializer_list ValuesToReload, bool AfterSync) { + PopCleanupBlocks(Old, ValuesToReload, AfterSync); // Move our deferred cleanups onto the EH stack. for (size_t I = OldLifetimeExtendedSize, @@ -635,7 +639,8 @@ static void destroyOptimisticNormalEntry(CodeGenFunction &CGF, /// current insertion point is threaded through the cleanup, as are /// any branch fixups on the cleanup. void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, - bool ForDeactivation) { + bool ForDeactivation, + bool AfterSync) { assert(!EHStack.empty() && "cleanup stack is empty!"); assert(isa(*EHStack.begin()) && "top not a cleanup!"); EHCleanupScope &Scope = cast(*EHStack.begin()); @@ -802,6 +807,11 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, EmitSehTryScopeEnd(); } + if (AfterSync) { + EmitImplicitSyncCleanup(); + return PopCleanupBlock(FallthroughIsBranchThrough, false); + } + destroyOptimisticNormalEntry(*this, Scope); Scope.MarkEmitted(); EHStack.popCleanup(); @@ -940,6 +950,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, // IV. Pop the cleanup and emit it. Scope.MarkEmitted(); + if (AfterSync) + EmitImplicitSyncCleanup(); EHStack.popCleanup(); assert(EHStack.hasNormalCleanups() == HasEnclosingCleanups); @@ -1093,13 +1105,17 @@ bool CodeGenFunction::isObviouslyBranchWithoutCleanups(JumpDest Dest) const { /// be known, in which case this will require a fixup. /// /// As a side-effect, this method clears the insertion point. -void CodeGenFunction::EmitBranchThroughCleanup(JumpDest Dest) { +void CodeGenFunction::EmitBranchThroughCleanup(JumpDest Dest, bool AfterSync) { assert(Dest.getScopeDepth().encloses(EHStack.stable_begin()) && "stale jump destination"); if (!HaveInsertPoint()) return; + // If needed, insert an implicit _Cilk_sync before the cleanups. + if (AfterSync) + EmitImplicitSyncCleanup(); + // Create the branch. llvm::BranchInst *BI = Builder.CreateBr(Dest.getBlock()); diff --git a/clang/lib/CodeGen/CGCleanup.h b/clang/lib/CodeGen/CGCleanup.h index c73c97146abc4d4..d8b098cfb5e6c61 100644 --- a/clang/lib/CodeGen/CGCleanup.h +++ b/clang/lib/CodeGen/CGCleanup.h @@ -95,6 +95,9 @@ class EHScope { LLVM_PREFERRED_TYPE(bool) unsigned TestFlagInEHCleanup : 1; + /// Whether this cleanup marks the exit of a task. + unsigned IsTaskExit : 1; + /// The amount of extra storage needed by the Cleanup. /// Always a multiple of the scope-stack alignment. 
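The `AfterSync` flag threaded through PopCleanupBlocks/PopCleanupBlock/EmitBranchThroughCleanup above implements one ordering rule: when compiling Cilk, the implicit sync at the end of a scope runs before that scope's cleanups. A hedged example of the ordering this preserves; `Log` and `record` are made-up names:

```cpp
// Hedged sketch: the implicit sync at function exit runs before ~Log(), so the
// spawned tasks cannot still be using the Log object while it is destroyed.
// That is the ordering the AfterSync parameter above arranges: sync first,
// then the normal cleanups.
#include <cstdio>

struct Log {
  ~Log() { std::puts("done"); }
};

void record(Log &log, int i);   // assumed to be defined elsewhere

void process(int n) {
  Log log;
  for (int i = 0; i < n; ++i)
    _Cilk_spawn record(log, i);
}   // implicit _Cilk_sync, then ~Log()
```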
unsigned CleanupSize : 12; @@ -354,6 +357,7 @@ class alignas(8) EHCleanupScope : public EHScope { CleanupBits.IsLifetimeMarker = false; CleanupBits.TestFlagInNormalCleanup = false; CleanupBits.TestFlagInEHCleanup = false; + CleanupBits.IsTaskExit = false; CleanupBits.CleanupSize = cleanupSize; assert(CleanupBits.CleanupSize == cleanupSize && "cleanup size overflow"); @@ -407,6 +411,9 @@ class alignas(8) EHCleanupScope : public EHScope { return CleanupBits.TestFlagInEHCleanup; } + bool isTaskExit() const { return CleanupBits.IsTaskExit; } + void setTaskExit() { CleanupBits.IsTaskExit = true; } + unsigned getFixupDepth() const { return FixupDepth; } EHScopeStack::stable_iterator getEnclosingNormalCleanup() const { return EnclosingNormal; diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index c3251bb5ab56579..0b0bc8602b78f80 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -2223,6 +2223,9 @@ void CodeGenFunction::pushDestroy(QualType::DestructionKind dtorKind, void CodeGenFunction::pushDestroy(CleanupKind cleanupKind, Address addr, QualType type, Destroyer *destroyer, bool useEHCleanupForArray) { + if (SpawnedCleanup) + return pushLifetimeExtendedDestroy(cleanupKind, addr, type, destroyer, + useEHCleanupForArray); pushFullExprCleanup(cleanupKind, addr, type, destroyer, useEHCleanupForArray); } diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index bb2ed237ee9f35f..feb9e71714a0fca 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -620,16 +620,27 @@ void CodeGenFunction::EmitEndEHSpec(const Decl *D) { } void CodeGenFunction::EmitCXXTryStmt(const CXXTryStmt &S) { - const llvm::Triple &T = Target.getTriple(); - // If we encounter a try statement on in an OpenMP target region offloaded to - // a GPU, we treat it as a basic block. - const bool IsTargetDevice = - (CGM.getLangOpts().OpenMPIsTargetDevice && (T.isNVPTX() || T.isAMDGCN())); - if (!IsTargetDevice) - EnterCXXTryStmt(S); - EmitStmt(S.getTryBlock()); - if (!IsTargetDevice) - ExitCXXTryStmt(S); + TaskFrameScope TFScope(*this); + EnterCXXTryStmt(S); + { + // If compiling Cilk code, create a nested sync region, with an implicit + // sync, for the try-catch. + // FIXME KITSUNE: Since we know that we will not be compiling Cilk, can we + // clean this up. + bool CompilingCilk = false; + SyncedScopeRAII SyncedScp(*this); + if (CompilingCilk) { + PushSyncRegion(); + if (isa(S.getTryBlock())) + ScopeIsSynced = true; + } + EmitStmt(S.getTryBlock()); + + // Pop the nested sync region after the try block. + if (CompilingCilk) + PopSyncRegion(); + } + ExitCXXTryStmt(S); } void CodeGenFunction::EnterCXXTryStmt(const CXXTryStmt &S, bool IsFnTryBlock) { @@ -873,6 +884,8 @@ llvm::BasicBlock *CodeGenFunction::EmitLandingPad() { case EHScope::Cleanup: // If we have a cleanup, remember that. hasCleanup = (hasCleanup || cast(*I).isEHCleanup()); + if (cast(*I).isTaskExit()) + goto done; continue; case EHScope::Filter: { diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 5f58a64d8386c31..7905f3983f4ca33 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -432,6 +432,11 @@ static RawAddress createReferenceTemporary(CodeGenFunction &CGF, // FIXME: Should we put the new global into a COMDAT? 
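The EmitCXXTryStmt change above wraps a try statement in a taskframe and, when Cilk code is being compiled (this KITSUNE branch pins `CompilingCilk` to false), gives the try block its own nested sync region with an implicit sync. The user-visible rule is that tasks spawned inside a try block are synced before control leaves the block. A hedged sketch; `may_throw` is a placeholder:

```cpp
// Hedged sketch: the spawned calls belong to the try block's own sync region,
// so they are synced at the end of the block; a handler never runs while a
// child spawned inside the block is still executing.
#include <cstdio>
#include <stdexcept>

void may_throw(int i);   // assumed to be defined elsewhere

void guarded(int n) {
  try {
    for (int i = 0; i < n; ++i)
      _Cilk_spawn may_throw(i);
    // implicit sync for the try block's sync region
  } catch (const std::exception &e) {
    std::printf("caught: %s\n", e.what());
  }
}
```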
return RawAddress(C, GV->getValueType(), alignment); } + if (CGF.IsSpawned) { + CGF.PushDetachScope(); + return CGF.CurDetachScope->CreateDetachedMemTemp( + Ty, M->getStorageDuration(), "det.ref.tmp"); + } return CGF.CreateMemTemp(Ty, "ref.tmp", Alloca); } case SD_Thread: @@ -530,6 +535,7 @@ EmitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *M) { EmitAnyExprToMem(E, Object, Qualifiers(), /*IsInit*/true); } } else { + if (!IsSpawned) { switch (M->getStorageDuration()) { case SD_Automatic: if (auto *Size = EmitLifetimeStart( @@ -584,6 +590,7 @@ EmitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *M) { default: break; } + } EmitAnyExprToMem(E, Object, Qualifiers(), /*IsInit*/true); } pushTemporaryCleanup(*this, M, E, Object); @@ -1589,7 +1596,12 @@ LValue CodeGenFunction::EmitLValueHelper(const Expr *E, case Expr::ExprWithCleanupsClass: { const auto *cleanups = cast(E); RunCleanupsScope Scope(*this); + bool CleanupsSaved = false; + if (IsSpawned) + CleanupsSaved = CurDetachScope->MaybeSaveCleanupsScope(&Scope); LValue LV = EmitLValue(cleanups->getSubExpr(), IsKnownNonNull); + if (CleanupsSaved) + CurDetachScope->CleanupDetach(); if (LV.isSimple()) { // Defend against branches out of gnu statement expressions surrounded by // cleanups. @@ -5816,6 +5828,16 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee assert(CalleeType->isFunctionPointerType() && "Call must have function pointer type!"); + if (IsSpawned) { + PushDetachScope(); + CurDetachScope->EnsureTaskFrame(); + } + + IsSpawnedScope SpawnedScp(this); + // RAII to finish detach scope after processing CallExpr E, if E uses a + // spawned value. + DetachScopeRAII DetScope(*this); + const Decl *TargetDecl = OrigCallee.getAbstractInfo().getCalleeDecl().getDecl(); @@ -6030,6 +6052,8 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee Address(Handle, Handle->getType(), CGM.getPointerAlign())); Callee.setFunctionPointer(Stub); } + + SpawnedScp.RestoreOldScope(); llvm::CallBase *CallOrInvoke = nullptr; RValue Call = EmitCall(FnInfo, Callee, ReturnValue, Args, &CallOrInvoke, E == MustTailCall, E->getExprLoc()); diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp index d9f44f4be617e54..6104c5089ee59f7 100644 --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -1243,6 +1243,12 @@ void AggExprEmitter::VisitBinAssign(const BinaryOperator *E) { EnsureDest(E->getRHS()->getType()); Visit(E->getRHS()); CGF.EmitAtomicStore(Dest.asRValue(), LHS, /*isInit*/ false); + if (CGF.IsSpawned) { + if (!(CGF.CurDetachScope && CGF.CurDetachScope->IsDetachStarted())) + CGF.FailedSpawnWarning(E->getRHS()->getExprLoc()); + CGF.IsSpawned = false; + CGF.PopDetachScope(); + } return; } @@ -1264,6 +1270,13 @@ void AggExprEmitter::VisitBinAssign(const BinaryOperator *E) { E->getType().isDestructedType() == QualType::DK_nontrivial_c_struct) CGF.pushDestroy(QualType::DK_nontrivial_c_struct, Dest.getAddress(), E->getType()); + + if (CGF.IsSpawned) { + if (!(CGF.CurDetachScope && CGF.CurDetachScope->IsDetachStarted())) + CGF.FailedSpawnWarning(E->getRHS()->getExprLoc()); + CGF.IsSpawned = false; + CGF.PopDetachScope(); + } } void AggExprEmitter:: @@ -1402,7 +1415,16 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) { void AggExprEmitter::VisitExprWithCleanups(ExprWithCleanups *E) { CodeGenFunction::RunCleanupsScope cleanups(CGF); + // If this expression is spawned, associate these cleanups with the detach + // scope. 
+ bool CleanupsSaved = false; + if (CGF.IsSpawned) + CleanupsSaved = CGF.CurDetachScope->MaybeSaveCleanupsScope(&cleanups); Visit(E->getSubExpr()); + // If this expression was spawned, then we must clean up the detach before + // forcing the scope's cleanup. + if (CleanupsSaved) + CGF.CurDetachScope->CleanupDetach(); } void AggExprEmitter::VisitCXXScalarValueInitExpr(CXXScalarValueInitExpr *E) { diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 8eb6ab7381acbc8..293be9123007c81 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -87,12 +87,19 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorCall( ReturnValueSlot ReturnValue, llvm::Value *This, llvm::Value *ImplicitParam, QualType ImplicitParamTy, const CallExpr *CE, CallArgList *RtlArgs) { + IsSpawnedScope SpawnedScp(this); const FunctionProtoType *FPT = MD->getType()->castAs(); CallArgList Args; + if (auto *OCE = dyn_cast_or_null(CE)) + if (OCE->isAssignmentOp()) + // Restore the original spawned scope when handling an assignment + // operator, so that the RHS of the assignment is detached. + SpawnedScp.RestoreOldScope(); MemberCallInfo CallInfo = commonEmitCXXMemberOrOperatorCall( *this, MD, This, ImplicitParam, ImplicitParamTy, CE, Args, RtlArgs); auto &FnInfo = CGM.getTypes().arrangeCXXMethodCall( Args, FPT, CallInfo.ReqArgs, CallInfo.PrefixSize); + SpawnedScp.RestoreOldScope(); return EmitCall(FnInfo, Callee, ReturnValue, Args, nullptr, CE && CE == MustTailCall, CE ? CE->getExprLoc() : SourceLocation()); @@ -218,6 +225,8 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( const Expr *Base) { assert(isa(CE) || isa(CE)); + IsSpawnedScope SpawnedScp(this); + // Compute the object pointer. bool CanUseVirtualCall = MD->isVirtual() && !HasQualifier; @@ -263,7 +272,7 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( CallArgList *RtlArgs = nullptr; LValue TrivialAssignmentRHS; if (auto *OCE = dyn_cast(CE)) { - if (OCE->isAssignmentOp()) { + if (OCE->isAssignmentOp() && !SpawnedScp.OldScopeIsSpawned()) { if (TrivialAssignment) { TrivialAssignmentRHS = EmitLValue(CE->getArg(1)); } else { @@ -291,6 +300,7 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( // constructing a new complete object of type Ctor. assert(!RtlArgs); assert(ReturnValue.isNull() && "Constructor shouldn't have return value"); + SpawnedScp.RestoreOldScope(); CallArgList Args; commonEmitCXXMemberOrOperatorCall( *this, {Ctor, Ctor_Complete}, This.getPointer(*this), @@ -311,12 +321,19 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( if (TrivialAssignment) { // We don't like to generate the trivial copy/move assignment operator // when it isn't necessary; just produce the proper effect here. + if (isa(CE) && SpawnedScp.OldScopeIsSpawned()) { + // Restore the original spawned scope so that the RHS of the assignment + // is detached. + SpawnedScp.RestoreOldScope(); + TrivialAssignmentRHS = EmitLValue(CE->getArg(1)); + } // It's important that we use the result of EmitLValue here rather than // emitting call arguments, in order to preserve TBAA information from // the RHS. LValue RHS = isa(CE) ? 
TrivialAssignmentRHS : EmitLValue(*CE->arg_begin()); + SpawnedScp.RestoreOldScope(); EmitAggregateAssign(This, RHS, CE->getType()); return RValue::get(This.getPointer(*this)); } @@ -374,6 +391,7 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( "Destructor shouldn't have explicit parameters"); assert(ReturnValue.isNull() && "Destructor shouldn't have return value"); if (UseVirtualCall) { + SpawnedScp.RestoreOldScope(); CGM.getCXXABI().EmitVirtualDestructorCall(*this, Dtor, Dtor_Complete, This.getAddress(), cast(CE)); @@ -391,6 +409,7 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( QualType ThisTy = IsArrow ? Base->getType()->getPointeeType() : Base->getType(); + SpawnedScp.RestoreOldScope(); EmitCXXDestructorCall(GD, Callee, This.getPointer(*this), ThisTy, /*ImplicitParam=*/nullptr, /*ImplicitParamTy=*/QualType(), CE); @@ -433,6 +452,7 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( This.setAddress(NewThisAddr); } + SpawnedScp.RestoreOldScope(); return EmitCXXMemberOrOperatorCall( CalleeDecl, Callee, ReturnValue, This.getPointer(*this), /*ImplicitParam=*/nullptr, QualType(), CE, RtlArgs); @@ -441,6 +461,7 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( RValue CodeGenFunction::EmitCXXMemberPointerCallExpr(const CXXMemberCallExpr *E, ReturnValueSlot ReturnValue) { + IsSpawnedScope SpawnedScp(this); const BinaryOperator *BO = cast(E->getCallee()->IgnoreParens()); const Expr *BaseExpr = BO->getLHS(); @@ -482,6 +503,7 @@ CodeGenFunction::EmitCXXMemberPointerCallExpr(const CXXMemberCallExpr *E, // And the rest of the call args EmitCallArgs(Args, FPT, E->arguments()); + SpawnedScp.RestoreOldScope(); return EmitCall(CGM.getTypes().arrangeCXXMethodCall(Args, FPT, required, /*PrefixSize=*/0), Callee, ReturnValue, Args, nullptr, E == MustTailCall, diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index 4d45f6d64c1cd9b..0c99f267c370c2b 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -237,7 +237,16 @@ class ComplexExprEmitter } ComplexPairTy VisitExprWithCleanups(ExprWithCleanups *E) { CodeGenFunction::RunCleanupsScope Scope(CGF); + // If this expression is spawned, associate these cleanups with the detach + // scope. + bool CleanupsSaved = false; + if (CGF.IsSpawned) + CleanupsSaved = CGF.CurDetachScope->MaybeSaveCleanupsScope(&Scope); ComplexPairTy Vals = Visit(E->getSubExpr()); + // If this expression was spawned, then we must clean up the detach before + // forcing the scope's cleanup. + if (CleanupsSaved) + CGF.CurDetachScope->CleanupDetach(); // Defend against dominance problems caused by jumps out of expression // evaluation through the shared cleanup block. 
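The CGExprCXX/CGExprAgg changes above make `lhs = _Cilk_spawn call(...)` behave as expected when the assignment goes through `operator=`: the spawned scope is restored before the right-hand side is emitted, so the call is what gets detached, and `FailedSpawnWarning` fires when nothing was actually spawned. A hedged sketch; `Matrix` and `multiply` are made-up names:

```cpp
// Hedged sketch: the call to multiply() is evaluated in the spawned task, and
// the parent must not read `result` until after the sync.  Spawning an
// expression that is not a call would produce the failed-spawn warning.
struct Matrix { /* fields omitted */ };

Matrix multiply(const Matrix &a, const Matrix &b);   // assumed to be defined elsewhere

void compute(Matrix &result, const Matrix &a, const Matrix &b) {
  result = _Cilk_spawn multiply(a, b);   // RHS detached via the trivial-assignment path above
  // ... continuation work ...
  _Cilk_sync;                            // result is safe to use after this
}
```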
Scope.ForceCleanup({&Vals.first, &Vals.second}); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 6e212e74676e8d9..3a2addc3106e18f 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -231,6 +231,8 @@ class ScalarExprEmitter CodeGenFunction &CGF; CGBuilderTy &Builder; bool IgnoreResultAssign; + bool DoSpawnedInit = false; + LValue LValueToSpawnInit; llvm::LLVMContext &VMContext; public: @@ -239,6 +241,13 @@ class ScalarExprEmitter VMContext(cgf.getLLVMContext()) { } + ScalarExprEmitter(CodeGenFunction &cgf, LValue LValueToSpawnInit, + bool ira=false) + : CGF(cgf), Builder(CGF.Builder), IgnoreResultAssign(ira), + DoSpawnedInit(true), LValueToSpawnInit(LValueToSpawnInit), + VMContext(cgf.getLLVMContext()) { + } + //===--------------------------------------------------------------------===// // Utilities //===--------------------------------------------------------------------===// @@ -2719,7 +2728,16 @@ Value *ScalarExprEmitter::VisitStmtExpr(const StmtExpr *E) { Value *ScalarExprEmitter::VisitExprWithCleanups(ExprWithCleanups *E) { CodeGenFunction::RunCleanupsScope Scope(CGF); + // If this expression is spawned, associate these cleanups with the detach + // scope. + bool CleanupsSaved = false; + if (CGF.IsSpawned) + CleanupsSaved = CGF.CurDetachScope->MaybeSaveCleanupsScope(&Scope); Value *V = Visit(E->getSubExpr()); + // If this expression was spawned, then we must clean up the detach before + // forcing the scope's cleanup. + if (CleanupsSaved) + CGF.CurDetachScope->CleanupDetach(); // Defend against dominance problems caused by jumps out of expression // evaluation through the shared cleanup block. Scope.ForceCleanup({&V}); @@ -5501,6 +5519,16 @@ Value *CodeGenFunction::EmitScalarExpr(const Expr *E, bool IgnoreResultAssign) { .Visit(const_cast(E)); } +void CodeGenFunction::EmitScalarExprIntoLValue(const Expr *E, LValue dest, + bool isInit) { + assert(E && hasScalarEvaluationKind(E->getType()) && + "Invalid scalar expression to emit"); + + Value *V = ScalarExprEmitter(*this).Visit(const_cast(E)); + EmitNullabilityCheck(dest, V, E->getExprLoc()); + EmitStoreThroughLValue(RValue::get(V), dest, isInit); +} + /// Emit a conversion from the specified type to the specified destination type, /// both of which are LLVM scalar types. 
Value *CodeGenFunction::EmitScalarConversion(Value *Src, QualType SrcTy, diff --git a/clang/lib/CodeGen/CGLoopInfo.cpp b/clang/lib/CodeGen/CGLoopInfo.cpp index 6b886bd6b6d2cf1..672d292a9210636 100644 --- a/clang/lib/CodeGen/CGLoopInfo.cpp +++ b/clang/lib/CodeGen/CGLoopInfo.cpp @@ -460,8 +460,10 @@ LoopAttributes::LoopAttributes(bool IsParallel) VectorizePredicateEnable(LoopAttributes::Unspecified), VectorizeWidth(0), VectorizeScalable(LoopAttributes::Unspecified), InterleaveCount(0), UnrollCount(0), UnrollAndJamCount(0), + TapirGrainsize(0), DistributeEnable(LoopAttributes::Unspecified), PipelineDisabled(false), - PipelineInitiationInterval(0), CodeAlign(0), MustProgress(false) {} + PipelineInitiationInterval(0), CodeAlign(0), MustProgress(false), + SpawnStrategy(LoopAttributes::Sequential) {} void LoopAttributes::clear() { IsParallel = false; @@ -470,6 +472,7 @@ void LoopAttributes::clear() { InterleaveCount = 0; UnrollCount = 0; UnrollAndJamCount = 0; + TapirGrainsize = 0; VectorizeEnable = LoopAttributes::Unspecified; UnrollEnable = LoopAttributes::Unspecified; UnrollAndJamEnable = LoopAttributes::Unspecified; @@ -479,6 +482,7 @@ void LoopAttributes::clear() { PipelineInitiationInterval = 0; CodeAlign = 0; MustProgress = false; + SpawnStrategy = LoopAttributes::Sequential; } LoopInfo::LoopInfo(BasicBlock *Header, const LoopAttributes &Attrs, @@ -496,6 +500,7 @@ LoopInfo::LoopInfo(BasicBlock *Header, const LoopAttributes &Attrs, if (!Attrs.IsParallel && Attrs.VectorizeWidth == 0 && Attrs.VectorizeScalable == LoopAttributes::Unspecified && Attrs.InterleaveCount == 0 && Attrs.UnrollCount == 0 && + Attrs.TapirGrainsize == 0 && Attrs.UnrollAndJamCount == 0 && !Attrs.PipelineDisabled && Attrs.PipelineInitiationInterval == 0 && Attrs.VectorizePredicateEnable == LoopAttributes::Unspecified && @@ -503,12 +508,39 @@ LoopInfo::LoopInfo(BasicBlock *Header, const LoopAttributes &Attrs, Attrs.UnrollEnable == LoopAttributes::Unspecified && Attrs.UnrollAndJamEnable == LoopAttributes::Unspecified && Attrs.DistributeEnable == LoopAttributes::Unspecified && - Attrs.CodeAlign == 0 && !StartLoc && !EndLoc && !Attrs.MustProgress) + Attrs.CodeAlign == 0 && !StartLoc && !EndLoc && !Attrs.MustProgress && + Attrs.SpawnStrategy == LoopAttributes::Sequential) return; TempLoopID = MDNode::getTemporary(Header->getContext(), std::nullopt); } +void LoopInfo::getTapirLoopProperties( + const LoopAttributes &Attrs, SmallVectorImpl &LoopProperties) { + LLVMContext &Ctx = Header->getContext(); + + if (Attrs.SpawnStrategy == LoopAttributes::Sequential) + return; + + // Setting tapir.loop.spawn.strategy + if (Attrs.SpawnStrategy != LoopAttributes::Sequential) { + Metadata *Vals[] = { + MDString::get(Ctx, "tapir.loop.spawn.strategy"), + ConstantAsMetadata::get(ConstantInt::get(llvm::Type::getInt32Ty(Ctx), + Attrs.SpawnStrategy))}; + LoopProperties.push_back(MDNode::get(Ctx, Vals)); + } + + // Setting tapir.loop.grainsize + if (Attrs.TapirGrainsize > 0) { + Metadata *Vals[] = { + MDString::get(Ctx, "tapir.loop.grainsize"), + ConstantAsMetadata::get(ConstantInt::get(llvm::Type::getInt32Ty(Ctx), + Attrs.TapirGrainsize))}; + LoopProperties.push_back(MDNode::get(Ctx, Vals)); + } +} + void LoopInfo::finish() { // We did not annotate the loop body instructions because there are no // attributes for this loop. 
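The CGLoopInfo changes in this file attach two Tapir-specific pieces of loop metadata, `tapir.loop.spawn.strategy` and `tapir.loop.grainsize`. At source level these typically come from a `_Cilk_for` loop and the Cilk grainsize pragma. A hedged sketch (pragma spelling follows OpenCilk; `touch` is a placeholder):

```cpp
// Hedged sketch: assuming the loop is emitted with the divide-and-conquer
// spawn strategy, getTapirLoopProperties() above adds
//   !{!"tapir.loop.spawn.strategy", i32 1}   ; 1 == LoopAttributes::DAC
//   !{!"tapir.loop.grainsize",      i32 8}
// to the loop's !llvm.loop metadata node.
void touch(int *a, int i);   // assumed to be defined elsewhere

void fill(int *a, int n) {
  #pragma cilk grainsize 8
  _Cilk_for (int i = 0; i < n; ++i)
    touch(a, i);
}
```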
@@ -588,8 +620,11 @@ void LoopInfo::finish() { CurLoopAttr = BeforeJam; } + SmallVector TapirLoopProperties; + getTapirLoopProperties(CurLoopAttr, TapirLoopProperties); + bool HasUserTransforms = false; - LoopID = createMetadata(CurLoopAttr, {}, HasUserTransforms); + LoopID = createMetadata(CurLoopAttr, TapirLoopProperties, HasUserTransforms); TempLoopID->replaceAllUsesWith(LoopID); } @@ -688,6 +723,7 @@ void LoopInfoStack::push(BasicBlock *Header, clang::ASTContext &Ctx, case LoopHintAttr::VectorizeWidth: case LoopHintAttr::InterleaveCount: case LoopHintAttr::PipelineInitiationInterval: + case LoopHintAttr::TapirGrainsize: llvm_unreachable("Options cannot be disabled."); break; } @@ -716,6 +752,7 @@ void LoopInfoStack::push(BasicBlock *Header, clang::ASTContext &Ctx, case LoopHintAttr::InterleaveCount: case LoopHintAttr::PipelineDisabled: case LoopHintAttr::PipelineInitiationInterval: + case LoopHintAttr::TapirGrainsize: llvm_unreachable("Options cannot enabled."); break; } @@ -738,6 +775,7 @@ void LoopInfoStack::push(BasicBlock *Header, clang::ASTContext &Ctx, case LoopHintAttr::Distribute: case LoopHintAttr::PipelineDisabled: case LoopHintAttr::PipelineInitiationInterval: + case LoopHintAttr::TapirGrainsize: llvm_unreachable("Options cannot be used to assume mem safety."); break; } @@ -760,6 +798,7 @@ void LoopInfoStack::push(BasicBlock *Header, clang::ASTContext &Ctx, case LoopHintAttr::PipelineDisabled: case LoopHintAttr::PipelineInitiationInterval: case LoopHintAttr::VectorizePredicate: + case LoopHintAttr::TapirGrainsize: llvm_unreachable("Options cannot be used with 'full' hint."); break; } @@ -793,6 +832,9 @@ void LoopInfoStack::push(BasicBlock *Header, clang::ASTContext &Ctx, case LoopHintAttr::PipelineInitiationInterval: setPipelineInitiationInterval(ValueInt); break; + case LoopHintAttr::TapirGrainsize: + setTapirGrainsize(ValueInt); + break; case LoopHintAttr::Unroll: case LoopHintAttr::UnrollAndJam: case LoopHintAttr::VectorizePredicate: diff --git a/clang/lib/CodeGen/CGLoopInfo.h b/clang/lib/CodeGen/CGLoopInfo.h index 0fe33b289130635..208cc9d57617dd3 100644 --- a/clang/lib/CodeGen/CGLoopInfo.h +++ b/clang/lib/CodeGen/CGLoopInfo.h @@ -70,6 +70,9 @@ struct LoopAttributes { /// llvm.unroll. unsigned UnrollAndJamCount; + /// tapir.loop.grainsize. + unsigned TapirGrainsize; + /// Value for llvm.loop.distribute.enable metadata. LVEnableState DistributeEnable; @@ -84,6 +87,12 @@ struct LoopAttributes { /// Value for whether the loop is required to make progress. bool MustProgress; + + /// Tapir-loop spawning strategy. + enum LSStrategy { Sequential, DAC }; + + /// Value for tapir.loop.spawn.strategy metadata. + LSStrategy SpawnStrategy; }; /// Information used when generating a structured loop. @@ -179,6 +188,9 @@ class LoopInfo { createFullUnrollMetadata(const LoopAttributes &Attrs, llvm::ArrayRef LoopProperties, bool &HasUserTransforms); + void getTapirLoopProperties( + const LoopAttributes &Attrs, + llvm::SmallVectorImpl &LoopProperties); /// @} /// Create a LoopID for this loop, including transformation-unspecific @@ -295,6 +307,15 @@ class LoopInfoStack { /// Set no progress for the next loop pushed. void setMustProgress(bool P) { StagedAttrs.MustProgress = P; } + /// Set the Tapir-loop spawning strategy for the next loop pushed. + void setSpawnStrategy(const LoopAttributes::LSStrategy &Strat) { + StagedAttrs.SpawnStrategy = Strat; + } + + /// Set the Tapir-loop grainsize for the next loop pushed. 
+ void setTapirGrainsize(unsigned C) { StagedAttrs.TapirGrainsize = C; } + +private: /// Returns true if there is LoopInfo on the stack. bool hasInfo() const { return !Active.empty(); } /// Return the LoopInfo for the current loop. HasInfo should be called diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 2f466602d2f680d..bd7452a63249fb8 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -507,6 +507,7 @@ Address CodeGenFunction::EmitCompoundStmt(const CompoundStmt &S, bool GetLast, // Keep track of the current cleanup stack depth, including debug scopes. LexicalScope Scope(*this, S.getSourceRange()); + SyncRegionRAII StmtSR(*this); return EmitCompoundStmtWithoutScope(S, GetLast, AggSlot); } @@ -1496,6 +1497,9 @@ void CodeGenFunction::EmitReturnStmt(const ReturnStmt &S) { SaveRetExprRAII SaveRetExpr(RV, *this); RunCleanupsScope cleanupScope(*this); + bool CleanupsSaved = false; + if (IsSpawned) + CleanupsSaved = CurDetachScope->MaybeSaveCleanupsScope(&cleanupScope); if (const auto *EWC = dyn_cast_or_null(RV)) RV = EWC->getSubExpr(); @@ -1571,8 +1575,21 @@ void CodeGenFunction::EmitReturnStmt(const ReturnStmt &S) { if (!RV || RV->isEvaluatable(getContext())) ++NumSimpleReturnExprs; + if (CleanupsSaved) + CurDetachScope->CleanupDetach(); cleanupScope.ForceCleanup(); - EmitBranchThroughCleanup(ReturnBlock); + if (IsSpawned) { + if (!(CurDetachScope && CurDetachScope->IsDetachStarted())) + FailedSpawnWarning(RV->getExprLoc()); + // Pop the detach scope + IsSpawned = false; + PopDetachScope(); + } + + // FIXME KITSUNE: Can we clean up this API since we know that we will never be + // compiling Cilk? + bool CompilingCilk = false; + EmitBranchThroughCleanup(ReturnBlock, CompilingCilk); } void CodeGenFunction::EmitDeclStmt(const DeclStmt &S) { diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt index 2a179deddcc3143..33a527ca61361eb 100644 --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -61,6 +61,7 @@ add_clang_library(clangCodeGen CGAtomic.cpp CGBlocks.cpp CGBuiltin.cpp + CGCilk.cpp CGCUDANV.cpp CGCUDARuntime.cpp CGCXX.cpp diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 2b2e23f1e5d7fb6..a3fe336b8967630 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -365,6 +365,8 @@ static void EmitIfUsed(CodeGenFunction &CGF, llvm::BasicBlock *BB) { void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { assert(BreakContinueStack.empty() && "mismatched push/pop in break/continue stack!"); + assert(!CurDetachScope && + "mismatched push/pop in detach-scope stack!"); assert(LifetimeExtendedCleanupStack.empty() && "mismatched push/pop of cleanups in EHStack!"); assert(DeferredDeactivationCleanupStack.empty() && @@ -406,6 +408,11 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { bool HasOnlyLifetimeMarkers = HasCleanups && EHStack.containsOnlyLifetimeMarkers(PrologueCleanupDepth); bool EmitRetDbgLoc = !HasCleanups || HasOnlyLifetimeMarkers; + bool SyncEmitted = false; + + // FIXME KITSUNE: Since we know that we will never be compiling Cilk, can we + // simplify this? 
+ bool CompilingCilk = false; std::optional OAL; if (HasCleanups) { @@ -420,12 +427,29 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { OAL = ApplyDebugLocation::CreateDefaultArtificial(*this, EndLoc); } - PopCleanupBlocks(PrologueCleanupDepth); + // If we're compiling Cilk, PopCleanupBlocks should emit a _Cilk_sync before + // any cleanups. + PopCleanupBlocks(PrologueCleanupDepth, {}, CompilingCilk); + SyncEmitted = true; + } else if (CompilingCilk && Builder.GetInsertBlock() && + ReturnBlock.getBlock()->use_empty()) { + // If we're compiling Cilk, emit an implicit sync for the function. In this + // case, EmitReturnBlock will recycle Builder.GetInsertBlock() for the + // function's return block, so we insert the implicit _Cilk_sync before + // calling EmitReturnBlock. + EmitImplicitSyncCleanup(); + SyncEmitted = true; } // Emit function epilog (to return). llvm::DebugLoc Loc = EmitReturnBlock(); + if (CompilingCilk && !SyncEmitted) { + // If we're compiling Cilk, emit an implicit sync for the function. + EmitImplicitSyncCleanup(); + SyncEmitted = true; + } + if (ShouldInstrumentFunction()) { if (CGM.getCodeGenOpts().InstrumentFunctions) CurFn->addFnAttr("instrument-function-exit", "__cyg_profile_func_exit"); @@ -571,6 +595,11 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { ReturnValue = Address::invalid(); } } + + if (CurSyncRegion) { + PopSyncRegion(); + assert(!CurSyncRegion && "Nested sync regions at end of function."); + } } /// ShouldInstrumentFunction - Return true if the current function should be diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 60e6841e1b3d695..0e9717486716adf 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -848,6 +848,18 @@ class CodeGenFunction : public CodeGenTypeCache { /// we're currently inside a conditionally-evaluated expression. template void pushFullExprCleanup(CleanupKind kind, As... A) { + if (SpawnedCleanup) { + if (kind & EHCleanup) + pushFullExprCleanupImpl( + static_cast(kind & ~NormalCleanup), A...); + pushCleanupAfterFullExpr(kind, A...); + return; + } + pushFullExprCleanupImpl(kind, A...); + } + + template + void pushFullExprCleanupImpl(CleanupKind kind, As... A) { // If we're not in a conditional branch, or if none of the // arguments requires saving, then use the unconditional cleanup. if (!isInConditionalBranch()) @@ -935,10 +947,14 @@ class CodeGenFunction : public CodeGenTypeCache { void PushDestructorCleanup(const CXXDestructorDecl *Dtor, QualType T, Address Addr); + /// EmitImplicitSyncCleanup - Emit an implicit sync. + void EmitImplicitSyncCleanup(llvm::Instruction *SyncRegion = nullptr); + /// PopCleanupBlock - Will pop the cleanup entry on the stack and /// process all branch fixups. void PopCleanupBlock(bool FallThroughIsBranchThrough = false, - bool ForDeactivation = false); + bool ForDeactivation = false, + bool AfterSync = false); /// DeactivateCleanupBlock - Deactivates the given cleanup block. /// The block cannot be reactivated. Pops it if it's the top of the @@ -970,6 +986,10 @@ class CodeGenFunction : public CodeGenTypeCache { bool OldDidCallStackSave; protected: bool PerformCleanup; + bool CleanupAfterSync; + /// Protected method to control whether a sync is inserted before any + /// cleanups. 
+ void setCleanupAfterSync(bool V = true) { CleanupAfterSync = V; } private: RunCleanupsScope(const RunCleanupsScope &) = delete; @@ -981,7 +1001,8 @@ class CodeGenFunction : public CodeGenTypeCache { public: /// Enter a new cleanup scope. explicit RunCleanupsScope(CodeGenFunction &CGF) - : DeactivateCleanups(CGF), PerformCleanup(true), CGF(CGF) { + : DeactivateCleanups(CGF), PerformCleanup(true), + CleanupAfterSync(false), CGF(CGF) { CleanupStackDepth = CGF.EHStack.stable_begin(); LifetimeExtendedCleanupStackSize = CGF.LifetimeExtendedCleanupStack.size(); @@ -1013,10 +1034,64 @@ class CodeGenFunction : public CodeGenTypeCache { CGF.DidCallStackSave = OldDidCallStackSave; DeactivateCleanups.ForceDeactivate(); CGF.PopCleanupBlocks(CleanupStackDepth, LifetimeExtendedCleanupStackSize, - ValuesToReload); + ValuesToReload, CleanupAfterSync); PerformCleanup = false; CGF.CurrentCleanupScopeDepth = OldCleanupScopeDepth; } + + /// Pops cleanup blocks until the given savepoint is reached, then add the + /// cleanups from the given savepoint in the lifetime-extended cleanups + /// stack. + void PopCleanupBlocksAndDetach( + std::initializer_list ValuesToReload) { + size_t OldLifetimeExtendedSize = LifetimeExtendedCleanupStackSize; + CGF.PopCleanupBlocks(CleanupStackDepth, ValuesToReload); + + // Do the detach, and get the new cleanup stack depth. + CGF.CurDetachScope->PushSpawnedTaskTerminate(); + CleanupStackDepth = CGF.EHStack.stable_begin(); + + // Move our deferred cleanups onto the EH stack. This scope will deal + // with these deferred cleanups when it is destroyed. + for (size_t I = OldLifetimeExtendedSize, + E = CGF.LifetimeExtendedCleanupStack.size(); I != E; /**/) { + // Alignment should be guaranteed by the vptrs in the individual cleanups. + assert((I % alignof(LifetimeExtendedCleanupHeader) == 0) && + "misaligned cleanup stack entry"); + + LifetimeExtendedCleanupHeader &Header = + reinterpret_cast( + CGF.LifetimeExtendedCleanupStack[I]); + I += sizeof(Header); + + CGF.EHStack.pushCopyOfCleanup(Header.getKind(), + &CGF.LifetimeExtendedCleanupStack[I], + Header.getSize()); + I += Header.getSize(); + + if (Header.isConditional()) { + Address ActiveFlag = + reinterpret_cast
(CGF.LifetimeExtendedCleanupStack[I]); + CGF.initFullExprCleanupWithFlag(ActiveFlag); + I += sizeof(ActiveFlag); + } + } + CGF.LifetimeExtendedCleanupStack.resize(OldLifetimeExtendedSize); + } + + void DoDetach(std::initializer_list ValuesToReload = {}) { + IsSpawnedScope SpawnedScp(&CGF); + CGF.DidCallStackSave = OldDidCallStackSave; + + PopCleanupBlocksAndDetach(ValuesToReload); + + LifetimeExtendedCleanupStackSize = + CGF.LifetimeExtendedCleanupStack.size(); + OldDidCallStackSave = CGF.DidCallStackSave; + CGF.DidCallStackSave = false; + OldCleanupScopeDepth = CGF.CurrentCleanupScopeDepth; + CGF.CurrentCleanupScopeDepth = CleanupStackDepth; + } }; // Cleanup stack depth of the RunCleanupsScope that was pushed most recently. @@ -1223,11 +1298,497 @@ class CodeGenFunction : public CodeGenTypeCache { ~OMPLocalDeclMapRAII() { SavedMap.swap(CGF.LocalDeclMap); } }; + /// In Cilk, flag indicating whether the current call/invoke is spawned. + bool IsSpawned = false; + bool SpawnedCleanup = false; + + /// RAII object to set/unset CodeGenFunction::IsSpawned. + class IsSpawnedScope { + CodeGenFunction *CGF; + bool OldIsSpawned; + bool OldSpawnedCleanup; + public: + IsSpawnedScope(CodeGenFunction *CGF); + ~IsSpawnedScope(); + bool OldScopeIsSpawned() const; + void RestoreOldScope(); + }; + + /// Cleanup to ensure a sync is inserted. If no SyncRegion is specified, then + /// this cleanup actually serves as a placeholder in EHStack, which ensures + /// that an implicit sync is inserted before any normal cleanups. + struct ImplicitSyncCleanup final : public EHScopeStack::Cleanup { + llvm::Instruction *SyncRegion; + public: + ImplicitSyncCleanup(llvm::Instruction *SyncRegion = nullptr) + : SyncRegion(SyncRegion) {} + + void Emit(CodeGenFunction &CGF, Flags F) override { + if (SyncRegion) + CGF.EmitImplicitSyncCleanup(SyncRegion); + } + }; + + // Flag indicating whether CodeGen is currently emitting within a some + // _Cilk_scope. + bool WithinCilkScope = false; + + /// Cleanup to ensure a tapir.runtime.end intrinsic is inserted. + struct TapirRuntimeEndCleanup final : public EHScopeStack::Cleanup { + llvm::Instruction *TapirRuntimeStart; + + public: + TapirRuntimeEndCleanup(llvm::Instruction *TapirRuntimeStart) + : TapirRuntimeStart(TapirRuntimeStart) {} + + void Emit(CodeGenFunction &CGF, Flags F) override { + CGF.Builder.CreateCall( + CGF.CGM.getIntrinsic(llvm::Intrinsic::tapir_runtime_end), + {TapirRuntimeStart}); + } + }; + + // Subclass of RunCleanupsScope that ensures an implicit sync is emitted + // before cleanups. + class ImplicitSyncScope : public RunCleanupsScope { + ImplicitSyncScope(const ImplicitSyncScope &) = delete; + void operator=(const ImplicitSyncScope &) = delete; + public: + explicit ImplicitSyncScope(CodeGenFunction &CGF) : RunCleanupsScope(CGF) { + setCleanupAfterSync(); + CGF.EHStack.pushCleanup(NormalCleanup); + } + + ~ImplicitSyncScope() { + if (PerformCleanup) + ForceCleanup(); + } + + void ForceCleanup() { + RunCleanupsScope::ForceCleanup(); + } + }; + + /// A sync region is a collection of spawned tasks and syncs such that syncs + /// in the collection may wait on the spawned tasks in the same collection + /// (control-flow permitting). In Cilk, certain constructs, such as functions + /// _Cilk_spawn bodies, or _Cilk_for loop bodies, use a separate sync region + /// to handle spawning and syncing of tasks within that construct. 
+ class SyncRegion { + CodeGenFunction &CGF; + SyncRegion *ParentRegion; + llvm::Instruction *SyncRegionStart = nullptr; + ImplicitSyncScope *InnerSyncScope = nullptr; + + SyncRegion(const SyncRegion &) = delete; + void operator=(const SyncRegion &) = delete; + public: + explicit SyncRegion(CodeGenFunction &CGF) + : CGF(CGF), ParentRegion(CGF.CurSyncRegion) {} + + ~SyncRegion() { + if (InnerSyncScope) + delete InnerSyncScope; + CGF.CurSyncRegion = ParentRegion; + } + + llvm::Instruction *getSyncRegionStart() const { + return SyncRegionStart; + } + void setSyncRegionStart(llvm::Instruction *SRStart) { + SyncRegionStart = SRStart; + } + + void addImplicitSync() { + if (!InnerSyncScope) + InnerSyncScope = new ImplicitSyncScope(CGF); + } + }; + + /// The current sync region. + SyncRegion *CurSyncRegion = nullptr; + + SyncRegion *PushSyncRegion() { + CurSyncRegion = new SyncRegion(*this); + return CurSyncRegion; + } + + llvm::Instruction *EmitSyncRegionStart(); + + void PopSyncRegion() { + delete CurSyncRegion; // ~SyncRegion updates CurSyncRegion + } + + void EnsureSyncRegion() { + if (!CurSyncRegion) + PushSyncRegion(); + if (!CurSyncRegion->getSyncRegionStart()) + CurSyncRegion->setSyncRegionStart(EmitSyncRegionStart()); + } + + // Flag to indicate whether the current scope is synced. Currently this flag + // is used to optionally push a SyncRegion inside of a lexical scope, so that + // any cleanups run within that lexical scope occur after an implicit sync. + bool ScopeIsSynced = false; + + // RAII for maintaining CodeGenFunction::ScopeIsSynced. + class SyncedScopeRAII { + CodeGenFunction &CGF; + bool OldScopeIsSynced; + public: + SyncedScopeRAII(CodeGenFunction &CGF) + : CGF(CGF), OldScopeIsSynced(CGF.ScopeIsSynced) {} + ~SyncedScopeRAII() { CGF.ScopeIsSynced = OldScopeIsSynced; } + }; + + // RAII for pushing and popping a sync region. + class SyncRegionRAII { + CodeGenFunction &CGF; + bool OldScopeIsSynced; + public: + SyncRegionRAII(CodeGenFunction &CGF, bool addImplicitSync = true) + : CGF(CGF), OldScopeIsSynced(CGF.ScopeIsSynced) { + if (CGF.ScopeIsSynced) { + CGF.PushSyncRegion(); + // If requested, add an implicit sync onto this sync region. + if (addImplicitSync) + CGF.CurSyncRegion->addImplicitSync(); + + CGF.ScopeIsSynced = false; + } + } + ~SyncRegionRAII() { + if (OldScopeIsSynced) { + CGF.PopSyncRegion(); + CGF.ScopeIsSynced = OldScopeIsSynced; + } + } + }; + + /// Cleanup to ensure a taskframe is ended with a taskframe.resume on an + /// exception-handling path. + struct CallTaskEnd final : public EHScopeStack::Cleanup { + llvm::Value *TaskFrame; + public: + CallTaskEnd(llvm::Value *TaskFrame) : TaskFrame(TaskFrame) {} + void Emit(CodeGenFunction &CGF, Flags F) override { + // Recreate the landingpad's return value for the rethrow invoke. Tapir + // lowering will replace this rethrow with a resume. 
+ llvm::Value *Exn = CGF.Builder.CreateLoad( + Address(CGF.ExceptionSlot, CGF.Int8PtrTy, CGF.getPointerAlign()), + "exn"); + llvm::Value *Sel = CGF.Builder.CreateLoad( + Address(CGF.EHSelectorSlot, CGF.Int32Ty, CharUnits::fromQuantity(4)), + "sel"); + llvm::Type *LPadType = + llvm::StructType::get(Exn->getType(), Sel->getType()); + llvm::Value *LPadVal = llvm::UndefValue::get(LPadType); + LPadVal = CGF.Builder.CreateInsertValue(LPadVal, Exn, 0, "lpad.val"); + LPadVal = CGF.Builder.CreateInsertValue(LPadVal, Sel, 1, "lpad.val"); + + llvm::Function *TaskFrameResume = + CGF.CGM.getIntrinsic(llvm::Intrinsic::taskframe_resume, + { LPadVal->getType() }); + CGF.Builder.CreateInvoke(TaskFrameResume, CGF.getUnreachableBlock(), + CGF.CurDetachScope->getTempInvokeDest(), + { TaskFrame, LPadVal }); + CGF.Builder.SetInsertPoint(CGF.CurDetachScope->getTempInvokeDest()); + } + }; + + /// Cleanup to ensure spawned task is ended with a detached.rethrow on an + /// exception-handling path. + struct CallDetRethrow final : public EHScopeStack::Cleanup { + llvm::Value *SyncRegion; + llvm::BasicBlock *TempInvokeDest; + public: + CallDetRethrow(llvm::Value *SyncRegion, + llvm::BasicBlock *TempInvokeDest = nullptr) + : SyncRegion(SyncRegion), TempInvokeDest(TempInvokeDest) {} + void Emit(CodeGenFunction &CGF, Flags F) override { + if (!TempInvokeDest) + TempInvokeDest = CGF.CurDetachScope->getTempInvokeDest(); + + // Recreate the landingpad's return value for the rethrow invoke. Tapir + // lowering will replace this rethrow with a resume. + llvm::Value *Exn = CGF.Builder.CreateLoad( + Address(CGF.ExceptionSlot, CGF.Int8PtrTy, CGF.getPointerAlign()), + "exn"); + llvm::Value *Sel = CGF.Builder.CreateLoad( + Address(CGF.EHSelectorSlot, CGF.Int32Ty, CharUnits::fromQuantity(4)), + "sel"); + llvm::Type *LPadType = + llvm::StructType::get(Exn->getType(), Sel->getType()); + llvm::Value *LPadVal = llvm::UndefValue::get(LPadType); + LPadVal = CGF.Builder.CreateInsertValue(LPadVal, Exn, 0, "lpad.val"); + LPadVal = CGF.Builder.CreateInsertValue(LPadVal, Sel, 1, "lpad.val"); + + llvm::Function *DetachedRethrow = + CGF.CGM.getIntrinsic(llvm::Intrinsic::detached_rethrow, + { LPadVal->getType() }); + CGF.Builder.CreateInvoke(DetachedRethrow, CGF.getUnreachableBlock(), + TempInvokeDest, { SyncRegion, LPadVal }); + CGF.Builder.SetInsertPoint(TempInvokeDest); + } + }; + + /// Object to manage creation of spawned tasks using Tapir instructions. + /// + /// Conceptually, each spawned task corresponds to a detach scope, which gets + /// its own copy of specific CodeGenFunction state, such as its own alloca + /// insert point and exception-handling state. In practice, detach scopes + /// maintain two scopes for each spawned task: a scope corresponding with the + /// taskframe of the task, and a scope for the task itself. + class DetachScope { + CodeGenFunction &CGF; + bool DetachStarted = false; + bool DetachCleanedUp = false; + llvm::DetachInst *Detach = nullptr; + llvm::BasicBlock *DetachedBlock = nullptr; + llvm::BasicBlock *ContinueBlock = nullptr; + + // Pointer to the parent detach scope. + DetachScope *ParentScope; + + // Possible cleanup scope from a child ExprWithCleanups of a CilkSpawnStmt. + // We keep track of this scope in order to properly adjust the scope when + // the emission of the task itself injects an additional cleanup onto + // EHStack. 
+ RunCleanupsScope *StmtCleanupsScope = nullptr; + + // Old alloca insertion points from the CGF to restore when we're done + // emitting the spawned task and associated taskframe. + llvm::AssertingVH OldAllocaInsertPt = nullptr; + // Alloca insertion point for the taskframe, which we save and restore + // around the emission of the spawned task itself. + llvm::AssertingVH TFAllocaInsertPt = nullptr; + // A temporary invoke destination, maintained to handle the emission of + // detached.rethrow and taskframe.resume intrinsics on exception-handling + // paths out of a spawned task or its taskframe. + llvm::BasicBlock *TempInvokeDest = nullptr; + + // Old EH state from the CGF to restore when we're done emitting the spawned + // task and associated taskframe. + llvm::BasicBlock *OldEHResumeBlock = nullptr; + llvm::Value *OldExceptionSlot = nullptr; + llvm::AllocaInst *OldEHSelectorSlot = nullptr; + Address OldNormalCleanupDest = Address::invalid(); + + // EH state for the taskframe, which we save and restore around the emission + // of the spawned task itself. + llvm::BasicBlock *TFEHResumeBlock = nullptr; + llvm::Value *TFExceptionSlot = nullptr; + llvm::AllocaInst *TFEHSelectorSlot = nullptr; + Address TFNormalCleanupDest = Address::invalid(); + + // Saved state in an initialized detach scope. + llvm::AssertingVH SavedDetachedAllocaInsertPt = nullptr; + + // Information about a reference temporary created early in the detached + // block. + Address RefTmp = Address::invalid(); + StorageDuration RefTmpSD; + + // Optional taskframe created separately from detach. + llvm::Value *TaskFrame = nullptr; + + void InitDetachScope(); + + DetachScope(const DetachScope &) = delete; + void operator=(const DetachScope &) = delete; + + public: + /// Enter a new detach scope + explicit DetachScope(CodeGenFunction &CGF) + : CGF(CGF), ParentScope(CGF.CurDetachScope) { + CGF.CurDetachScope = this; + EnsureTaskFrame(); + } + + /// Exit this detach scope. + ~DetachScope() { + if (TempInvokeDest && TempInvokeDest->use_empty()) + delete TempInvokeDest; + CGF.CurDetachScope = ParentScope; + } + + // Optionally save the specified cleanups scope, so it can be properly + // updated when a spawned task is emitted. + bool MaybeSaveCleanupsScope(RunCleanupsScope *Scope) { + if (!StmtCleanupsScope) { + StmtCleanupsScope = Scope; + return true; + } + return false; + } + + // Methods to handle the taskframe associated with the spawned task. + void EnsureTaskFrame(); + llvm::Value *GetTaskFrame() { return TaskFrame; } + + // Create nested exception-handling state for a taskframe or spawned task. + void CreateTaskFrameEHState(); + void CreateDetachedEHState(); + // Restore ancestor exception-handling state of a spawned task or taskframe. + // Returns a pointer to any EHResumeBlock that was generated during the + // emission of the spawned task or taskframe. + llvm::BasicBlock *RestoreTaskFrameEHState(); + llvm::BasicBlock *RestoreParentEHState(); + + // Get a temporary destination for an invoke, creating a new one if + // necessary. + llvm::BasicBlock *getTempInvokeDest() { + if (!TempInvokeDest) + TempInvokeDest = CGF.createBasicBlock("temp.invoke.dest"); + return TempInvokeDest; + } + + // Start the spawned task, i.e., by emitting a detach instruction and + // setting up nested CGF state. + void StartDetach(); + // Returns true if the spawned task has started. + bool IsDetachStarted() const { return DetachStarted; } + // Push a terminator for the spawned task onto EHStack. 
+ void PushSpawnedTaskTerminate(); + // Clean up state for the spawned task. + void CleanupDetach(); + // Emit the end of the spawned task, i.e., a reattach. + void EmitTaskEnd(); + // Finish the spawned task. + void FinishDetach(); + + // Create a temporary for the spawned task, specifically, before the spawned + // task has started. + Address CreateDetachedMemTemp(QualType Ty, StorageDuration SD, + const Twine &Name = "det.tmp"); + }; + + /// The current detach scope. + DetachScope *CurDetachScope = nullptr; + + /// Push a new detach scope onto the stack, but do not begin the detach. + void PushDetachScope() { + EnsureSyncRegion(); + if (!CurDetachScope || CurDetachScope->IsDetachStarted()) + CurDetachScope = new DetachScope(*this); + } + + /// Finish the current detach scope and pop it off the stack. + void PopDetachScope() { + CurDetachScope->FinishDetach(); + delete CurDetachScope; + } + + /// Produce a warning that we failed to emit a spawn. + void FailedSpawnWarning(SourceLocation SLoc) { + DiagnosticsEngine &Diags = CGM.getDiags(); + unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Warning, + "Failed to emit spawn"); + Diags.Report(SLoc, DiagID); + } + + // RAII for automatically popping detach scopes at the end of code-generating + // an expression. + class DetachScopeRAII { + CodeGenFunction &CGF; + CodeGenFunction::DetachScope *StartingDetachScope; + public: + DetachScopeRAII(CodeGenFunction &CGF) + : CGF(CGF), StartingDetachScope(CGF.CurDetachScope) {} + ~DetachScopeRAII() { + if (!CGF.CurDetachScope || CGF.CurDetachScope == StartingDetachScope) + // No detach scope was pushed, so there's nothing to do. + return; + CGF.PopDetachScope(); + assert(CGF.CurDetachScope == StartingDetachScope && + "Unexpected detach scope"); + CGF.IsSpawned = false; + } + }; + + // Simple RAII object for creating an unassociated taskframe. + class TaskFrameScope { + CodeGenFunction &CGF; + + // Old alloca insertion points from the CGF to restore when we're done + // emitting the spawned task and associated taskframe. + llvm::AssertingVH OldAllocaInsertPt = nullptr; + + // A temporary invoke destination, maintained to handle the emission of + // detached.rethrow and taskframe.resume intrinsics on exception-handling + // paths out of a spawned task or its taskframe. + llvm::BasicBlock *TempInvokeDest = nullptr; + + // Old EH state from the CGF to restore when we're done emitting the spawned + // task and associated taskframe. + llvm::BasicBlock *OldEHResumeBlock = nullptr; + llvm::Value *OldExceptionSlot = nullptr; + llvm::AllocaInst *OldEHSelectorSlot = nullptr; + Address OldNormalCleanupDest = Address::invalid(); + + // Taskframe created separately from detach. + llvm::Value *TaskFrame = nullptr; + public: + TaskFrameScope(CodeGenFunction &CGF); + ~TaskFrameScope(); + + llvm::Value *getTaskFrame() const { return TaskFrame; } + + // Get a temporary destination for an invoke, creating a new one if + // necessary. + llvm::BasicBlock *getTempInvokeDest() { + if (!TempInvokeDest) + TempInvokeDest = CGF.createBasicBlock("temp.invoke.dest"); + return TempInvokeDest; + } + }; + + /// Cleanup to ensure a taskframe is ended with a taskframe.resume on an + /// exception-handling path. + struct EndUnassocTaskFrame final : public EHScopeStack::Cleanup { + TaskFrameScope *TFScope; + public: + EndUnassocTaskFrame(TaskFrameScope *TFScope) : TFScope(TFScope) {} + void Emit(CodeGenFunction &CGF, Flags F) override { + if (F.isForNormalCleanup()) { + // For normal cleanups, just insert a call to taskframe.end. 
+ llvm::Function *TaskFrameEnd = + CGF.CGM.getIntrinsic(llvm::Intrinsic::taskframe_end); + assert(TFScope->getTaskFrame() && "No taskframe in TFScope"); + CGF.Builder.CreateCall(TaskFrameEnd, { TFScope->getTaskFrame() }); + return; + } + + // Recreate the landingpad's return value for the rethrow invoke. Tapir + // lowering will replace this rethrow with a resume. + llvm::Value *Exn = CGF.Builder.CreateLoad( + Address(CGF.ExceptionSlot, CGF.Int8PtrTy, CGF.getPointerAlign()), + "exn"); + llvm::Value *Sel = CGF.Builder.CreateLoad( + Address(CGF.EHSelectorSlot, CGF.Int32Ty, CharUnits::fromQuantity(4)), + "sel"); + llvm::Type *LPadType = + llvm::StructType::get(Exn->getType(), Sel->getType()); + llvm::Value *LPadVal = llvm::UndefValue::get(LPadType); + LPadVal = CGF.Builder.CreateInsertValue(LPadVal, Exn, 0, "lpad.val"); + LPadVal = CGF.Builder.CreateInsertValue(LPadVal, Sel, 1, "lpad.val"); + + llvm::Function *TaskFrameResume = + CGF.CGM.getIntrinsic(llvm::Intrinsic::taskframe_resume, + { LPadVal->getType() }); + CGF.Builder.CreateInvoke(TaskFrameResume, CGF.getUnreachableBlock(), + TFScope->getTempInvokeDest(), + { TFScope->getTaskFrame(), LPadVal }); + CGF.Builder.SetInsertPoint(TFScope->getTempInvokeDest()); + } + }; + /// Takes the old cleanup stack size and emits the cleanup blocks /// that have been added. void PopCleanupBlocks(EHScopeStack::stable_iterator OldCleanupStackSize, - std::initializer_list ValuesToReload = {}); + std::initializer_list ValuesToReload = {}, + bool AfterSync = false); /// Takes the old cleanup stack size and emits the cleanup blocks /// that have been added, then adds all lifetime-extended cleanups from @@ -1235,7 +1796,8 @@ class CodeGenFunction : public CodeGenTypeCache { void PopCleanupBlocks(EHScopeStack::stable_iterator OldCleanupStackSize, size_t OldLifetimeExtendedStackSize, - std::initializer_list ValuesToReload = {}); + std::initializer_list ValuesToReload = {}, + bool AfterSync = false); void ResolveBranchFixups(llvm::BasicBlock *Target); @@ -1258,7 +1820,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// EmitBranchThroughCleanup - Emit a branch from the current insert /// block through the normal cleanup handling code (if any) and then /// on to \arg Dest. - void EmitBranchThroughCleanup(JumpDest Dest); + void EmitBranchThroughCleanup(JumpDest Dest, bool AfterSync = false); /// isObviouslyBranchWithoutCleanups - Return true if a branch to the /// specified destination obviously has no cleanups to run. 'false' is always @@ -4809,7 +5371,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// EmitScalarExpr - Emit the computation of the specified expression of LLVM /// scalar type, returning the result. - llvm::Value *EmitScalarExpr(const Expr *E , bool IgnoreResultAssign = false); + llvm::Value *EmitScalarExpr(const Expr *E, bool IgnoreResultAssign = false); + void EmitScalarExprIntoLValue(const Expr *E, LValue dest, bool isInit); /// Emit a conversion from the specified type to the specified destination /// type, both of which are LLVM scalar types. 
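For orientation, the sketch below shows the kind of Cilk source that exercises the DetachScope, SyncRegion, and implicit-sync machinery added above, with comments noting the Tapir constructs the emitted IR is expected to contain. It is an illustrative sketch only: the function names are made up, and the IR shape described in the comments is an assumption based on the intrinsics referenced in this header, not output reproduced from this patch.

    int work(int x);                // hypothetical helper, not part of this patch

    int wrapper(int x) {
      int a = _Cilk_spawn work(x);  // DetachScope: taskframe plus detach/reattach,
                                    // with detached.rethrow on exception paths
      int b = work(x + 1);          // continuation runs in the parent
      _Cilk_sync;                   // sync on the current SyncRegion
      return a + b;                 // ImplicitSyncScope also syncs at scope exit
    }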
diff --git a/clang/lib/CodeGen/EHScopeStack.h b/clang/lib/CodeGen/EHScopeStack.h index 0c667e80bb6d8cf..e371860c1251e69 100644 --- a/clang/lib/CodeGen/EHScopeStack.h +++ b/clang/lib/CodeGen/EHScopeStack.h @@ -87,6 +87,8 @@ enum CleanupKind : unsigned { LifetimeMarker = 0x8, NormalEHLifetimeMarker = LifetimeMarker | NormalAndEHCleanup, + + TaskExit = 0x10, }; /// A stack of scopes which respond to exceptions, including cleanups diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt index 32a4378ab499faf..7a1cae2c2bcc81d 100644 --- a/clang/lib/Driver/CMakeLists.txt +++ b/clang/lib/Driver/CMakeLists.txt @@ -27,6 +27,7 @@ add_clang_library(clangDriver OptionUtils.cpp Phases.cpp SanitizerArgs.cpp + Tapir.cpp Tool.cpp ToolChain.cpp ToolChains/Arch/AArch64.cpp diff --git a/clang/lib/Driver/Tapir.cpp b/clang/lib/Driver/Tapir.cpp new file mode 100644 index 000000000000000..7a4b9eedc7bd44d --- /dev/null +++ b/clang/lib/Driver/Tapir.cpp @@ -0,0 +1,76 @@ +//===--- Tapir.cpp - C Language Family Language Options ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the functions from Tapir.h +// +//===----------------------------------------------------------------------===// + +#include "clang/Driver/Tapir.h" +#include "clang/Driver/Options.h" +#include "clang/Driver/ToolChain.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" + +using namespace clang::driver; +using namespace clang; +using namespace llvm::opt; + +TapirTargetID clang::parseTapirTarget(const ArgList &Args) { + // Use Cilk if -ftapir is not specified but -fcilkplus is specified. + if (!Args.hasArg(options::OPT_ftapir_EQ)) { + return TapirTargetID::None; + } + + // Otherwise use the runtime specified by -ftapir. 
+ TapirTargetID TapirTarget = TapirTargetID::None; + if (const Arg *A = Args.getLastArg(options::OPT_ftapir_EQ)) + TapirTarget = llvm::StringSwitch(A->getValue()) + .Case("none", TapirTargetID::None) + .Case("serial", TapirTargetID::Serial) + .Case("cheetah", TapirTargetID::Cheetah) + .Case("lambda", TapirTargetID::Lambda) + .Case("omptask", TapirTargetID::OMPTask) + .Case("opencilk", TapirTargetID::OpenCilk) + .Case("qthreads", TapirTargetID::Qthreads) + .Default(TapirTargetID::Last_TapirTargetID); + + return TapirTarget; +} + +std::optional +clang::serializeTapirTarget(TapirTargetID Target) { + std::optional TapirTargetStr; + switch (Target) { + case TapirTargetID::None: + TapirTargetStr = "none"; + break; + case TapirTargetID::Serial: + TapirTargetStr = "serial"; + break; + case TapirTargetID::Cheetah: + TapirTargetStr = "cheetah"; + break; + case TapirTargetID::Lambda: + TapirTargetStr = "lambda"; + break; + case TapirTargetID::OMPTask: + TapirTargetStr = "omptask"; + break; + case TapirTargetID::OpenCilk: + TapirTargetStr = "opencilk"; + break; + case TapirTargetID::Qthreads: + TapirTargetStr = "qthreads"; + break; + case TapirTargetID::Last_TapirTargetID: + break; + } + return TapirTargetStr; +} diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 20a555afb8092f9..2cf9407fcb66828 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -23,6 +23,7 @@ #include "clang/Driver/Job.h" #include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Driver/Tapir.h" #include "clang/Driver/XRayArgs.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" @@ -1618,3 +1619,272 @@ llvm::opt::DerivedArgList *ToolChain::TranslateXarchArgs( delete DAL; return nullptr; } + +void ToolChain::AddOpenCilkIncludeDir(const ArgList &Args, + ArgStringList &CmdArgs) const { + if (!Args.hasArg(options::OPT_opencilk_resource_dir_EQ)) + return; + + const Arg *A = Args.getLastArg(options::OPT_opencilk_resource_dir_EQ); + SmallString<128> P; + + // Check for an include directory. + P.assign(A->getValue()); + llvm::sys::path::append(P, "include"); + if (getVFS().exists(P)) { + addSystemInclude(Args, CmdArgs, P); + } else { + D.Diag(diag::err_drv_opencilk_resource_dir_missing_include) + << A->getAsString(Args); + } +} + +ToolChain::path_list +ToolChain::getOpenCilkRuntimePaths(const ArgList &Args) const { + path_list Paths; + + if (!Args.hasArg(options::OPT_opencilk_resource_dir_EQ)) { + Paths = getRuntimePaths(); + Paths.push_back(getCompilerRTPath()); + return Paths; + } + + // If -opencilk-resource-dir= is specified, try to use that directory, and + // raise an error if that fails. + const Arg *A = Args.getLastArg(options::OPT_opencilk_resource_dir_EQ); + + // Try the triple passed to driver as --target=. + { + SmallString<128> P(A->getValue()); + llvm::sys::path::append(P, "lib", getTriple().str()); + Paths.push_back(std::string(P.str())); + } + // Try excluding the triple. + { + SmallString<128> P(A->getValue()); + if (Triple.isOSUnknown()) { + llvm::sys::path::append(P, "lib"); + } else { + llvm::sys::path::append(P, "lib", getOSLibName()); + } + Paths.push_back(std::string(P.str())); + } + + return Paths; +} + +static void addOpenCilkRuntimeRunPath(const ToolChain &TC, const ArgList &Args, + ArgStringList &CmdArgs, + const llvm::Triple &Triple) { + // Allow the -fno-rtlib-add-rpath flag to prevent adding this default + // directory to the runpath. 
+ if (!Args.hasFlag(options::OPT_frtlib_add_rpath, + options::OPT_fno_rtlib_add_rpath, true)) + return; + + bool FoundCandidate = false; + for (auto CandidateRPath : TC.getOpenCilkRuntimePaths(Args)) { + if (TC.getVFS().exists(CandidateRPath)) { + FoundCandidate = true; + CmdArgs.push_back("-L"); + CmdArgs.push_back(Args.MakeArgString(CandidateRPath.c_str())); + CmdArgs.push_back("-rpath"); + CmdArgs.push_back(Args.MakeArgString(CandidateRPath.c_str())); + } + } + if (FoundCandidate && Triple.isOSBinFormatELF()) + CmdArgs.push_back("--enable-new-dtags"); +} + +static StringRef getArchNameForOpenCilkRTLib(const ToolChain &TC, + const ArgList &Args) { + return getArchNameForCompilerRTLib(TC, Args); +} + +std::string ToolChain::getOpenCilkBCBasename(const ArgList &Args, + StringRef Component, + bool AddArch) const { + const llvm::Triple &TT = getTriple(); + const char *Prefix = "lib"; + const char *Suffix = ".bc"; + std::string ArchAndEnv; + if (AddArch) { + StringRef Arch = getArchNameForOpenCilkRTLib(*this, Args); + const char *Env = TT.isAndroid() ? "-android" : ""; + ArchAndEnv = ("-" + Arch + Env).str(); + } + return (Prefix + Component + ArchAndEnv + Suffix).str(); +} + +std::optional ToolChain::getOpenCilkBC(const ArgList &Args, + StringRef Component) const { + // Check for runtime files without the architecture first. + std::string BCBasename = + getOpenCilkBCBasename(Args, Component, /*AddArch=*/false); + for (auto RuntimePath : getOpenCilkRuntimePaths(Args)) { + SmallString<128> P(RuntimePath); + llvm::sys::path::append(P, BCBasename); + if (getVFS().exists(P)) + return std::optional(std::string(P.str())); + } + + // Fall back to the OpenCilk name with the arch if the no-arch version does + // not exist. + BCBasename = getOpenCilkBCBasename(Args, Component, /*AddArch=*/true); + for (auto RuntimePath : getOpenCilkRuntimePaths(Args)) { + SmallString<128> P(RuntimePath); + llvm::sys::path::append(P, BCBasename); + if (getVFS().exists(P)) + return std::optional(std::string(P.str())); + } + + return std::nullopt; +} + +void ToolChain::AddOpenCilkABIBitcode(const ArgList &Args, + ArgStringList &CmdArgs, + bool IsLTO) const { + // If --opencilk-abi-bitcode= is specified, use that specified path. + if (Args.hasArg(options::OPT_opencilk_abi_bitcode_EQ)) { + const Arg *A = Args.getLastArg(options::OPT_opencilk_abi_bitcode_EQ); + SmallString<128> P(A->getValue()); + if (!getVFS().exists(P)) { + getDriver().Diag(diag::err_drv_opencilk_missing_abi_bitcode) + << A->getAsString(Args); + } + if (IsLTO) + CmdArgs.push_back( + Args.MakeArgString("--plugin-opt=opencilk-abi-bitcode=" + P)); + } + + bool UseAsan = getSanitizerArgs(Args).needsAsanRt(); + StringRef OpenCilkBCName = UseAsan ? "opencilk-asan-abi" : "opencilk-abi"; + if (auto OpenCilkABIBCFilename = getOpenCilkBC(Args, OpenCilkBCName)) { + if (IsLTO) + CmdArgs.push_back(Args.MakeArgString("--plugin-opt=opencilk-abi-bitcode=" + + *OpenCilkABIBCFilename)); + else + CmdArgs.push_back(Args.MakeArgString("--opencilk-abi-bitcode=" + + *OpenCilkABIBCFilename)); + return; + } + + // Error if we could not find a bitcode file. 
+ getDriver().Diag(diag::err_drv_opencilk_missing_abi_bitcode) + << getOpenCilkBCBasename(Args, OpenCilkBCName, /*AddArch=*/false); +} + +std::string ToolChain::getOpenCilkRTBasename(const ArgList &Args, + StringRef Component, + FileType Type, + bool AddArch) const { + const llvm::Triple &TT = getTriple(); + const char *Prefix = "lib"; + const char *Suffix; + switch (Type) { + case ToolChain::FT_Object: + Suffix = ".o"; + break; + case ToolChain::FT_Static: + Suffix = ".a"; + break; + case ToolChain::FT_Shared: + Suffix = ".so"; + break; + } + std::string ArchAndEnv; + if (AddArch) { + StringRef Arch = getArchNameForOpenCilkRTLib(*this, Args); + const char *Env = TT.isAndroid() ? "-android" : ""; + ArchAndEnv = ("-" + Arch + Env).str(); + } + return (Prefix + Component + ArchAndEnv + Suffix).str(); +} + +std::string ToolChain::getOpenCilkRT(const ArgList &Args, StringRef Component, + FileType Type) const { + // Check for runtime files without the architecture first. + std::string RTBasename = + getOpenCilkRTBasename(Args, Component, Type, /*AddArch=*/false); + if (Args.hasArg(options::OPT_opencilk_resource_dir_EQ)) { + // If opencilk-resource-dir is specified, look for the library in that + // directory. + for (auto RuntimePath : getOpenCilkRuntimePaths(Args)) { + SmallString<128> P(RuntimePath); + llvm::sys::path::append(P, RTBasename); + if (getVFS().exists(P)) + return std::string(P.str()); + } + } else { + for (const auto &LibPath : getLibraryPaths()) { + SmallString<128> P(LibPath); + llvm::sys::path::append(P, RTBasename); + if (getVFS().exists(P)) + // If we found the library in LibraryPaths, let the linker resolve it. + return std::string(("-l" + Component).str()); + } + } + + // Fall back to the OpenCilk name with the arch if the no-arch version does + // not exist. + RTBasename = getOpenCilkRTBasename(Args, Component, Type, /*AddArch=*/true); + for (auto RuntimePath : getOpenCilkRuntimePaths(Args)) { + SmallString<128> P(RuntimePath); + llvm::sys::path::append(P, RTBasename); + if (getVFS().exists(P)) + return std::string(P.str()); + } + + // Otherwise, trust the linker to find the library on the system. + return std::string(("-l" + Component).str()); +} + +void ToolChain::AddTapirRuntimeLibArgs(const ArgList &Args, + ArgStringList &CmdArgs) const { + TapirTargetID TapirTarget = parseTapirTarget(Args); + if (TapirTarget == TapirTargetID::Last_TapirTargetID) + if (const Arg *A = Args.getLastArg(options::OPT_ftapir_EQ)) + getDriver().Diag(diag::err_drv_invalid_value) << A->getAsString(Args) + << A->getValue(); + + switch (TapirTarget) { + case TapirTargetID::Cheetah: + CmdArgs.push_back("-lcheetah"); + CmdArgs.push_back("-lpthread"); + break; + case TapirTargetID::OpenCilk: { + bool StaticOpenCilk = Args.hasArg(options::OPT_static); + bool UseAsan = getSanitizerArgs(Args).needsAsanRt(); + + // Link the correct Cilk personality fn + if (getDriver().CCCIsCXX()) + CmdArgs.push_back(Args.MakeArgString(getOpenCilkRT( + Args, + UseAsan ? "opencilk-asan-personality-cpp" + : "opencilk-personality-cpp", + StaticOpenCilk ? ToolChain::FT_Static : ToolChain::FT_Shared))); + else + CmdArgs.push_back(Args.MakeArgString(getOpenCilkRT( + Args, + UseAsan ? "opencilk-asan-personality-c" : "opencilk-personality-c", + StaticOpenCilk ? ToolChain::FT_Static : ToolChain::FT_Shared))); + + // Link the opencilk runtime. We do this after linking the personality + // function, to ensure that symbols are resolved correctly when using static + // linking. 
+ CmdArgs.push_back(Args.MakeArgString(getOpenCilkRT( + Args, UseAsan ? "opencilk-asan" : "opencilk", + StaticOpenCilk ? ToolChain::FT_Static : ToolChain::FT_Shared))); + + // Add to the executable's runpath the default directory containing OpenCilk + // runtime. + addOpenCilkRuntimeRunPath(*this, Args, CmdArgs, Triple); + break; + } + case TapirTargetID::Qthreads: + CmdArgs.push_back("-lqthread"); + break; + default: + break; + } +} diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 8858c318aba7a1a..68f6aecec4b20d4 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1234,6 +1234,9 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, // OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++. addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH"); + // If a custom OpenCilk resource directory is specified, add its include path. + getToolChain().AddOpenCilkIncludeDir(Args, CmdArgs); + // While adding the include arguments, we also attempt to retrieve the // arguments of related offloading toolchains or arguments that are specific // of an offloading programming model. @@ -6626,6 +6629,70 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_show_template_tree); Args.AddLastArg(CmdArgs, options::OPT_fno_elide_type); + // Forward flags for Cilk. + Args.AddLastArg(CmdArgs, options::OPT_ftapir_EQ); + if (Args.hasArg(options::OPT_ftapir_EQ)) { + auto const &Triple = getToolChain().getTriple(); + + // FIXME KITSUNE: Change the unsupported cilk diagnostic to kitsune. + // At least one runtime has been implemented for these operating systems. + if (!Triple.isOSLinux() && !Triple.isOSFreeBSD() && !Triple.isMacOSX()) + D.Diag(diag::err_drv_cilk_unsupported); + + /* JFC: Is it possible to confuse with with -fno-opencilk? */ + bool OpenCilk = false; + bool Cheetah = false; + bool CustomTarget = false; + + if (Arg *TapirRuntime = Args.getLastArgNoClaim(options::OPT_ftapir_EQ)) { + Cheetah = TapirRuntime->getValue() == StringRef("cheetah"); + if (TapirRuntime->getValue() == StringRef("opencilk")) { + OpenCilk = true; + } else { + CustomTarget = true; + } + } + + // FIXME KITSUNE: Change the unsupported cilk diagnostic to kitsune. + if (Cheetah && Triple.getArch() != llvm::Triple::x86_64) { + D.Diag(diag::err_drv_cilk_unsupported); + } + if (OpenCilk) { + switch (Triple.getArch()) { + case llvm::Triple::x86: + case llvm::Triple::x86_64: + case llvm::Triple::arm: + case llvm::Triple::armeb: + case llvm::Triple::aarch64: + case llvm::Triple::aarch64_be: + break; + default: + // FIXME KITSUNE: Change the unsupported cilk diagnostic to kitsune. + D.Diag(diag::err_drv_cilk_unsupported); + break; + } + + // If an OpenCilk resource directory is specified, check that it is valid. + if (Args.hasArgNoClaim(options::OPT_opencilk_resource_dir_EQ)) { + bool ValidPathFound = false; + for (auto Path : getToolChain().getOpenCilkRuntimePaths(Args)) { + if (D.getVFS().exists(Path)) { + ValidPathFound = true; + break; + } + } + if (!ValidPathFound) + D.Diag(diag::err_drv_opencilk_resource_dir_missing_lib) + << Args.getLastArgNoClaim(options::OPT_opencilk_resource_dir_EQ) + ->getAsString(Args); + } + + if (!CustomTarget) + // Add the OpenCilk ABI bitcode file. + getToolChain().AddOpenCilkABIBitcode(Args, CmdArgs); + } + } + // Forward flags for OpenMP. 
We don't do this if the current action is an // device offloading action other than OpenMP. if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, @@ -7439,6 +7506,15 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_slp_vectorize, EnableSLPVec)) CmdArgs.push_back("-vectorize-slp"); + // -fstripmine is enabled based on the optimization level selected. For now, + // we enable stripmining when the optimization level enables vectorization. + bool EnableStripmine = EnableVec; + OptSpecifier StripmineAliasOption = + EnableStripmine ? options::OPT_O_Group : options::OPT_fstripmine; + if (Args.hasFlag(options::OPT_fstripmine, StripmineAliasOption, + options::OPT_fno_stripmine, EnableStripmine)) + CmdArgs.push_back("-stripmine-loops"); + ParseMPreferVectorWidth(D, Args, CmdArgs); Args.AddLastArg(CmdArgs, options::OPT_fshow_overloads_EQ); diff --git a/clang/lib/Driver/ToolChains/CloudABI.cpp b/clang/lib/Driver/ToolChains/CloudABI.cpp new file mode 100644 index 000000000000000..f5b8480fb0ec9c8 --- /dev/null +++ b/clang/lib/Driver/ToolChains/CloudABI.cpp @@ -0,0 +1,154 @@ +//===--- CloudABI.cpp - CloudABI ToolChain Implementations ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CloudABI.h" +#include "CommonArgs.h" +#include "clang/Driver/Compilation.h" +#include "clang/Driver/Driver.h" +#include "clang/Driver/InputInfo.h" +#include "clang/Driver/Options.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Support/Path.h" + +using namespace clang::driver; +using namespace clang::driver::tools; +using namespace clang::driver::toolchains; +using namespace clang; +using namespace llvm::opt; + +void cloudabi::Linker::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + const ToolChain &ToolChain = getToolChain(); + const Driver &D = ToolChain.getDriver(); + ArgStringList CmdArgs; + + // Silence warning for "clang -g foo.o -o foo" + Args.ClaimAllArgs(options::OPT_g_Group); + // and "clang -emit-llvm foo.o -o foo" + Args.ClaimAllArgs(options::OPT_emit_llvm); + // and for "clang -w foo.o -o foo". Other warning options are already + // handled somewhere else. + Args.ClaimAllArgs(options::OPT_w); + + if (!D.SysRoot.empty()) + CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot)); + + // CloudABI only supports static linkage. + CmdArgs.push_back("-Bstatic"); + CmdArgs.push_back("--no-dynamic-linker"); + + // Provide PIE linker flags in case PIE is default for the architecture. 
+ if (ToolChain.isPIEDefault(Args)) { + CmdArgs.push_back("-pie"); + CmdArgs.push_back("-zrelro"); + } + + CmdArgs.push_back("--eh-frame-hdr"); + CmdArgs.push_back("--gc-sections"); + + if (Output.isFilename()) { + CmdArgs.push_back("-o"); + CmdArgs.push_back(Output.getFilename()); + } else { + assert(Output.isNothing() && "Invalid output."); + } + + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) { + CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crt0.o"))); + CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtbegin.o"))); + } + + Args.AddAllArgs(CmdArgs, options::OPT_L); + ToolChain.AddFilePathLibArgs(Args, CmdArgs); + Args.AddAllArgs(CmdArgs, + {options::OPT_T_Group, options::OPT_s, options::OPT_t, + options::OPT_Z_Flag, options::OPT_r}); + + if (D.isUsingLTO()) { + assert(!Inputs.empty() && "Must have at least one input."); + addLTOOptions(ToolChain, Args, CmdArgs, Output, Inputs[0], + D.getLTOMode() == LTOK_Thin); + } + + AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); + + if (ToolChain.ShouldLinkCXXStdlib(Args)) + ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs); + + ToolChain.AddTapirRuntimeLibArgs(Args, CmdArgs); + + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { + CmdArgs.push_back("-lc"); + CmdArgs.push_back("-lcompiler_rt"); + } + + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) + CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtend.o"))); + + const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); +} + +// CloudABI - CloudABI tool chain which can call ld(1) directly. + +CloudABI::CloudABI(const Driver &D, const llvm::Triple &Triple, + const ArgList &Args) + : Generic_ELF(D, Triple, Args) { + SmallString<128> P(getDriver().Dir); + llvm::sys::path::append(P, "..", getTriple().str(), "lib"); + getFilePaths().push_back(std::string(P.str())); +} + +void CloudABI::addLibCxxIncludePaths(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const { + SmallString<128> P(getDriver().Dir); + llvm::sys::path::append(P, "..", getTriple().str(), "include/c++/v1"); + addSystemInclude(DriverArgs, CC1Args, P.str()); +} + +void CloudABI::AddCXXStdlibLibArgs(const ArgList &Args, + ArgStringList &CmdArgs) const { + CmdArgs.push_back("-lc++"); + if (Args.hasArg(options::OPT_fexperimental_library)) + CmdArgs.push_back("-lc++experimental"); + CmdArgs.push_back("-lc++abi"); + CmdArgs.push_back("-lunwind"); +} + +Tool *CloudABI::buildLinker() const { + return new tools::cloudabi::Linker(*this); +} + +bool CloudABI::isPIEDefault(const llvm::opt::ArgList &Args) const { + // Only enable PIE on architectures that support PC-relative + // addressing. PC-relative addressing is required, as the process + // startup code must be able to relocate itself. 
+ switch (getTriple().getArch()) { + case llvm::Triple::aarch64: + case llvm::Triple::x86_64: + return true; + default: + return false; + } +} + +SanitizerMask CloudABI::getSupportedSanitizers() const { + SanitizerMask Res = ToolChain::getSupportedSanitizers(); + Res |= SanitizerKind::SafeStack; + return Res; +} + +SanitizerMask CloudABI::getDefaultSanitizers() const { + return SanitizerKind::SafeStack; +} diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 019df16a909f4e3..7de0189c951fb40 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -337,6 +337,16 @@ static bool shouldIgnoreUnsupportedTargetFeature(const Arg &TargetFeatureArg, return TargetFeatureArg.getOption().matches(options::OPT_mno_cumode); } +static void renderTapirLoweringOptions(const ArgList &Args, + ArgStringList &CmdArgs, + const ToolChain &TC) { + if (Args.hasArg(options::OPT_ftapir_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_ftapir_EQ)) + CmdArgs.push_back(Args.MakeArgString( + Twine("--plugin-opt=tapir-target=") + A->getValue())); + } +} + void tools::addPathIfExists(const Driver &D, const Twine &Path, ToolChain::path_list &Paths) { if (D.getVFS().exists(Path)) @@ -1120,6 +1130,8 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, // Handle remarks hotness/threshold related options. renderRemarksHotnessOptions(Args, CmdArgs, PluginOptPrefix); + renderTapirLoweringOptions(Args, CmdArgs, ToolChain); + addMachineOutlinerArgs(D, Args, CmdArgs, ToolChain.getEffectiveTriple(), /*IsLTO=*/true, PluginOptPrefix); diff --git a/clang/lib/Driver/ToolChains/CrossWindows.cpp b/clang/lib/Driver/ToolChains/CrossWindows.cpp index 3c5dfba329cf8e8..eba34c6890ba51a 100644 --- a/clang/lib/Driver/ToolChains/CrossWindows.cpp +++ b/clang/lib/Driver/ToolChains/CrossWindows.cpp @@ -178,6 +178,8 @@ void tools::CrossWindows::Linker::ConstructJob( CmdArgs.push_back("-Bdynamic"); } + TC.AddTapirRuntimeLibArgs(Args, CmdArgs); + if (!Args.hasArg(options::OPT_nostdlib)) { if (!Args.hasArg(options::OPT_nodefaultlibs)) { // TODO handle /MT[d] /MD[d] diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index e576efaf5ca884c..e331dab642bb741 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -18,6 +18,7 @@ #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Driver/Tapir.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" @@ -557,6 +558,19 @@ static void renderRemarksOptions(const ArgList &Args, ArgStringList &CmdArgs, } } +static void renderTapirLoweringOptions(const ArgList &Args, + ArgStringList &CmdArgs, + const ToolChain &TC, bool LinkerIsLLD) { + if (!(TC.getDriver().isUsingLTO() && LinkerIsLLD)) + return; + + if (Args.hasArg(options::OPT_ftapir_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_ftapir_EQ)) + CmdArgs.push_back( + Args.MakeArgString(Twine("--tapir-target=") + A->getValue())); + } +} + static void AppendPlatformPrefix(SmallString<128> &Path, const llvm::Triple &T); void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -610,6 +624,8 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA, getToolChain().getTriple())) renderRemarksOptions(Args, CmdArgs, getToolChain().getTriple(), Output, JA); + 
renderTapirLoweringOptions(Args, CmdArgs, getToolChain(), LinkerIsLLD); + // Propagate the -moutline flag to the linker in LTO. if (Arg *A = Args.getLastArg(options::OPT_moutline, options::OPT_mno_outline)) { @@ -742,6 +758,8 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } + getMachOToolChain().AddLinkTapirRuntime(Args, CmdArgs); + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) { // endfile_spec is empty. } @@ -3538,3 +3556,168 @@ void Darwin::printVerboseInfo(raw_ostream &OS) const { CudaInstallation->print(OS); RocmInstallation->print(OS); } + +ToolChain::path_list +DarwinClang::getOpenCilkRuntimePaths(const ArgList &Args) const { + path_list Paths; + if (!Args.hasArg(options::OPT_opencilk_resource_dir_EQ)) { + SmallString<128> P(getDriver().ResourceDir); + llvm::sys::path::append(P, "lib", "darwin"); + Paths.push_back(std::string(P.str())); + return Paths; + } + + // If -opencilk-resource-dir= is specified, try to use that directory, and + // raise an error if that fails. + const Arg *A = Args.getLastArg(options::OPT_opencilk_resource_dir_EQ); + + // Try the lib/darwin subdirectory + { + SmallString<128> P(A->getValue()); + llvm::sys::path::append(P, "lib", "darwin"); + Paths.push_back(std::string(P.str())); + } + // Try the lib subdirectory + { + SmallString<128> P(A->getValue()); + llvm::sys::path::append(P, "lib"); + Paths.push_back(std::string(P.str())); + } + return Paths; +} + +void DarwinClang::AddOpenCilkABIBitcode(const ArgList &Args, + ArgStringList &CmdArgs, + bool IsLTO) const { + // If --opencilk-abi-bitcode= is specified, use that specified path. + if (Args.hasArg(options::OPT_opencilk_abi_bitcode_EQ)) { + const Arg *A = Args.getLastArg(options::OPT_opencilk_abi_bitcode_EQ); + SmallString<128> P(A->getValue()); + if (!getVFS().exists(P)) + getDriver().Diag(diag::err_drv_opencilk_missing_abi_bitcode) + << A->getAsString(Args); + if (IsLTO) + CmdArgs.push_back( + Args.MakeArgString("--opencilk-abi-bitcode=" + P)); + } + + bool UseAsan = getSanitizerArgs(Args).needsAsanRt(); + SmallString<128> BitcodeFilename(UseAsan ? "libopencilk-asan-abi" + : "libopencilk-abi"); + BitcodeFilename += "_"; + BitcodeFilename += getOSLibraryNameSuffix(); + BitcodeFilename += ".bc"; + + for (auto RuntimePath : getOpenCilkRuntimePaths(Args)) { + SmallString<128> P(RuntimePath); + llvm::sys::path::append(P, BitcodeFilename); + if (getVFS().exists(P)) { + // The same argument works regardless of IsLTO. + CmdArgs.push_back(Args.MakeArgString("--opencilk-abi-bitcode=" + P)); + return; + } + } + getDriver().Diag(diag::err_drv_opencilk_missing_abi_bitcode) + << BitcodeFilename; +} + +void DarwinClang::AddLinkTapirRuntimeLib(const ArgList &Args, + ArgStringList &CmdArgs, + StringRef LibName, + RuntimeLinkOptions Opts, + bool IsShared) const { + SmallString<64> DarwinLibName = StringRef("lib"); + DarwinLibName += LibName; + DarwinLibName += "_"; + DarwinLibName += getOSLibraryNameSuffix(); + DarwinLibName += IsShared ? "_dynamic.dylib" : ".a"; + SmallString<128> Dir(getDriver().ResourceDir); + if (Args.hasArg(options::OPT_opencilk_resource_dir_EQ)) { + for (auto OpenCilkRuntimeDir : getOpenCilkRuntimePaths(Args)) { + if (getVFS().exists(OpenCilkRuntimeDir)) { + Dir.assign(OpenCilkRuntimeDir); + break; + } + } + } else { + llvm::sys::path::append( + Dir, "lib", (Opts & RLO_IsEmbedded) ? 
"macho_embedded" : "darwin"); + } + + SmallString<128> P(Dir); + llvm::sys::path::append(P, DarwinLibName); + + // For now, allow missing resource libraries to support developers who may + // not have compiler-rt checked out or integrated into their build (unless + // we explicitly force linking with this library). + if ((Opts & RLO_AlwaysLink) || getVFS().exists(P)) { + const char *LibArg = Args.MakeArgString(P); + CmdArgs.push_back(LibArg); + } + + // Adding the rpaths might negatively interact when other rpaths are involved, + // so we should make sure we add the rpaths last, after all user-specified + // rpaths. This is currently true from this place, but we need to be + // careful if this function is ever called before user's rpaths are emitted. + if (Opts & RLO_AddRPath) { + assert(DarwinLibName.endswith(".dylib") && "must be a dynamic library"); + + // Add @executable_path to rpath to support having the dylib copied with + // the executable. + CmdArgs.push_back("-rpath"); + CmdArgs.push_back("@executable_path"); + + // Add the path to the resource dir to rpath to support using the dylib + // from the default location without copying. + CmdArgs.push_back("-rpath"); + CmdArgs.push_back(Args.MakeArgString(Dir)); + } +} + +void DarwinClang::AddLinkTapirRuntime(const ArgList &Args, + ArgStringList &CmdArgs) const { + TapirTargetID TapirTarget = parseTapirTarget(Args); + if (TapirTarget == TapirTargetID::Last_TapirTargetID) + if (const Arg *A = Args.getLastArg(options::OPT_ftapir_EQ)) + getDriver().Diag(diag::err_drv_invalid_value) << A->getAsString(Args) + << A->getValue(); + + switch (TapirTarget) { + case TapirTargetID::Cheetah: + CmdArgs.push_back("-lcheetah"); + break; + case TapirTargetID::OpenCilk: { + bool StaticOpenCilk = false; + bool UseAsan = getSanitizerArgs(Args).needsAsanRt(); + + auto RLO = RLO_AlwaysLink; + if (!StaticOpenCilk) + RLO = RuntimeLinkOptions(RLO | RLO_AddRPath); + + // Link the correct Cilk personality fn + if (getDriver().CCCIsCXX()) + AddLinkTapirRuntimeLib(Args, CmdArgs, + UseAsan ? "opencilk-asan-personality-cpp" + : "opencilk-personality-cpp", + RLO, !StaticOpenCilk); + else + AddLinkTapirRuntimeLib(Args, CmdArgs, + UseAsan ? "opencilk-asan-personality-c" + : "opencilk-personality-c", + RLO, !StaticOpenCilk); + + // Link the opencilk runtime. We do this after linking the personality + // function, to ensure that symbols are resolved correctly when using static + // linking. + AddLinkTapirRuntimeLib(Args, CmdArgs, + UseAsan ? "opencilk-asan" : "opencilk", RLO, + !StaticOpenCilk); + break; + } + case TapirTargetID::Qthreads: + CmdArgs.push_back("-lqthread"); + break; + default: + break; + } +} diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h index 2e55b49682a7e97..d40eb7939ecf049 100644 --- a/clang/lib/Driver/ToolChains/Darwin.h +++ b/clang/lib/Driver/ToolChains/Darwin.h @@ -172,6 +172,10 @@ class LLVM_LIBRARY_VISIBILITY MachO : public ToolChain { virtual void AddLinkARCArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const {} + /// Add the linker arguments to link a Tapir runtime library. + virtual void AddLinkTapirRuntime(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const {} + /// Add the linker arguments to link the compiler runtime library. 
/// /// FIXME: This API is intended for use with embedded libraries only, and is @@ -631,6 +635,16 @@ class LLVM_LIBRARY_VISIBILITY DarwinClang : public Darwin { void AddLinkARCArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const override; + path_list + getOpenCilkRuntimePaths(const llvm::opt::ArgList &Args) const override; + + void AddOpenCilkABIBitcode(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + bool IsLTO = false) const override; + + void AddLinkTapirRuntime(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const override; + unsigned GetDefaultDwarfVersion() const override; // Until dtrace (via CTF) and LLDB can deal with distributed debug info, // Darwin defaults to standalone/full debug info. @@ -646,6 +660,10 @@ class LLVM_LIBRARY_VISIBILITY DarwinClang : public Darwin { llvm::opt::ArgStringList &CmdArgs, StringRef Sanitizer, bool shared = true) const; + void AddLinkTapirRuntimeLib(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + StringRef LibName, RuntimeLinkOptions Opts, + bool IsShared) const; bool AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, diff --git a/clang/lib/Driver/ToolChains/DragonFly.cpp b/clang/lib/Driver/ToolChains/DragonFly.cpp index 1dbc46763c1156c..b0629b41ed6ef89 100644 --- a/clang/lib/Driver/ToolChains/DragonFly.cpp +++ b/clang/lib/Driver/ToolChains/DragonFly.cpp @@ -127,6 +127,8 @@ void dragonfly::Linker::ConstructJob(Compilation &C, const JobAction &JA, AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); + getToolChain().AddTapirRuntimeLibArgs(Args, CmdArgs); + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs, options::OPT_r)) { if (!Static) { diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index a8ee6540001ee4f..467d68a48f715db 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -290,6 +290,9 @@ void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, unsigned Major = ToolChain.getTriple().getOSMajorVersion(); bool Profiling = Args.hasArg(options::OPT_pg) && Major != 0 && Major < 14; + + ToolChain.AddTapirRuntimeLibArgs(Args, CmdArgs); + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs, options::OPT_r)) { // Use the static OpenMP runtime with -static-openmp @@ -326,6 +329,7 @@ void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, linkSanitizerRuntimeDeps(ToolChain, Args, CmdArgs); if (NeedsXRayDeps) linkXRayRuntimeDeps(ToolChain, Args, CmdArgs); + // FIXME: For some reason GCC passes -lgcc and -lgcc_s before adding // the default system libraries. Just mimic this for now. if (Profiling) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 543f3965dfd4f6f..2fe920b834cf98c 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -550,6 +550,8 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, // The profile runtime also needs access to system libraries. 
getToolChain().addProfileRTLibs(Args, CmdArgs); + ToolChain.AddTapirRuntimeLibArgs(Args, CmdArgs); + if (D.CCCIsCXX() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs, options::OPT_r)) { diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp index c81a7ed17029633..e6135195a8ddffc 100644 --- a/clang/lib/Driver/ToolChains/MinGW.cpp +++ b/clang/lib/Driver/ToolChains/MinGW.cpp @@ -255,6 +255,8 @@ void tools::MinGW::Linker::ConstructJob(Compilation &C, const JobAction &JA, addFortranRuntimeLibs(TC, Args, CmdArgs); } + TC.AddTapirRuntimeLibArgs(Args, CmdArgs); + // TODO: Add profile stuff here if (TC.ShouldLinkCXXStdlib(Args)) { diff --git a/clang/lib/Driver/ToolChains/Minix.cpp b/clang/lib/Driver/ToolChains/Minix.cpp new file mode 100644 index 000000000000000..e9665b712e59a12 --- /dev/null +++ b/clang/lib/Driver/ToolChains/Minix.cpp @@ -0,0 +1,118 @@ +//===--- Minix.cpp - Minix ToolChain Implementations ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Minix.h" +#include "CommonArgs.h" +#include "clang/Driver/Compilation.h" +#include "clang/Driver/Driver.h" +#include "clang/Driver/InputInfo.h" +#include "clang/Driver/Options.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Support/VirtualFileSystem.h" + +using namespace clang::driver; +using namespace clang; +using namespace llvm::opt; + +void tools::minix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + claimNoWarnArgs(Args); + ArgStringList CmdArgs; + + Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); + + CmdArgs.push_back("-o"); + CmdArgs.push_back(Output.getFilename()); + + for (const auto &II : Inputs) + CmdArgs.push_back(II.getFilename()); + + const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); +} + +void tools::minix::Linker::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + const Driver &D = getToolChain().getDriver(); + ArgStringList CmdArgs; + + if (Output.isFilename()) { + CmdArgs.push_back("-o"); + CmdArgs.push_back(Output.getFilename()); + } else { + assert(Output.isNothing() && "Invalid output."); + } + + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles, + options::OPT_r)) { + CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath("crt1.o"))); + CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath("crti.o"))); + CmdArgs.push_back( + Args.MakeArgString(getToolChain().GetFilePath("crtbegin.o"))); + CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath("crtn.o"))); + } + + Args.AddAllArgs(CmdArgs, + {options::OPT_L, options::OPT_T_Group, options::OPT_e}); + + AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); + + getToolChain().addProfileRTLibs(Args, CmdArgs); + + getToolChain().AddTapirRuntimeLibArgs(Args, CmdArgs); + + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs, + options::OPT_r)) { + if (D.CCCIsCXX()) 
{ + if (getToolChain().ShouldLinkCXXStdlib(Args)) + getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); + CmdArgs.push_back("-lm"); + } + } + + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles, + options::OPT_r)) { + if (Args.hasArg(options::OPT_pthread)) + CmdArgs.push_back("-lpthread"); + CmdArgs.push_back("-lc"); + CmdArgs.push_back("-lCompilerRT-Generic"); + CmdArgs.push_back("-L/usr/pkg/compiler-rt/lib"); + CmdArgs.push_back( + Args.MakeArgString(getToolChain().GetFilePath("crtend.o"))); + } + + const char *Exec = Args.MakeArgString(getToolChain().GetLinkerPath()); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); +} + +/// Minix - Minix tool chain which can call as(1) and ld(1) directly. + +toolchains::Minix::Minix(const Driver &D, const llvm::Triple &Triple, + const ArgList &Args) + : Generic_ELF(D, Triple, Args) { + getFilePaths().push_back(getDriver().Dir + "/../lib"); + getFilePaths().push_back("/usr/lib"); +} + +Tool *toolchains::Minix::buildAssembler() const { + return new tools::minix::Assembler(*this); +} + +Tool *toolchains::Minix::buildLinker() const { + return new tools::minix::Linker(*this); +} diff --git a/clang/lib/Driver/ToolChains/Myriad.cpp b/clang/lib/Driver/ToolChains/Myriad.cpp new file mode 100644 index 000000000000000..530ecfaf2f287ae --- /dev/null +++ b/clang/lib/Driver/ToolChains/Myriad.cpp @@ -0,0 +1,295 @@ +//===--- Myriad.cpp - Myriad ToolChain Implementations ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Myriad.h" +#include "CommonArgs.h" +#include "clang/Driver/Compilation.h" +#include "clang/Driver/Driver.h" +#include "clang/Driver/DriverDiagnostic.h" +#include "clang/Driver/Options.h" +#include "llvm/Option/ArgList.h" + +using namespace clang::driver; +using namespace clang::driver::toolchains; +using namespace clang; +using namespace llvm::opt; + +using tools::addPathIfExists; + +void tools::SHAVE::Compiler::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + ArgStringList CmdArgs; + assert(Inputs.size() == 1); + const InputInfo &II = Inputs[0]; + assert(II.getType() == types::TY_C || II.getType() == types::TY_CXX || + II.getType() == types::TY_PP_CXX); + + if (JA.getKind() == Action::PreprocessJobClass) { + Args.ClaimAllArgs(); + CmdArgs.push_back("-E"); + } else { + assert(Output.getType() == types::TY_PP_Asm); // Require preprocessed asm. + CmdArgs.push_back("-S"); + CmdArgs.push_back("-fno-exceptions"); // Always do this even if unspecified. + } + CmdArgs.push_back("-DMYRIAD2"); + + // Append all -I, -iquote, -isystem paths, defines/undefines, 'f' + // flags, 'g' flags, 'M' flags, optimize flags, warning options, + // mcpu flags, mllvm flags, and Xclang flags. + // These are spelled the same way in clang and moviCompile. 
+ Args.AddAllArgsExcept( + CmdArgs, + {options::OPT_I_Group, options::OPT_clang_i_Group, options::OPT_std_EQ, + options::OPT_D, options::OPT_U, options::OPT_f_Group, + options::OPT_f_clang_Group, options::OPT_g_Group, options::OPT_M_Group, + options::OPT_O_Group, options::OPT_W_Group, options::OPT_mcpu_EQ, + options::OPT_mllvm, options::OPT_Xclang}, + {options::OPT_fno_split_dwarf_inlining}); + Args.hasArg(options::OPT_fno_split_dwarf_inlining); // Claim it if present. + + // If we're producing a dependency file, and assembly is the final action, + // then the name of the target in the dependency file should be the '.o' + // file, not the '.s' file produced by this step. For example, instead of + // /tmp/mumble.s: mumble.c .../someheader.h + // the filename on the lefthand side should be "mumble.o" + if (Args.getLastArg(options::OPT_MF) && !Args.getLastArg(options::OPT_MT) && + C.getActions().size() == 1 && + C.getActions()[0]->getKind() == Action::AssembleJobClass) { + Arg *A = Args.getLastArg(options::OPT_o); + if (A) { + CmdArgs.push_back("-MT"); + CmdArgs.push_back(Args.MakeArgString(A->getValue())); + } + } + + CmdArgs.push_back(II.getFilename()); + CmdArgs.push_back("-o"); + CmdArgs.push_back(Output.getFilename()); + + std::string Exec = + Args.MakeArgString(getToolChain().GetProgramPath("moviCompile")); + C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), + Args.MakeArgString(Exec), CmdArgs, + Inputs, Output)); +} + +void tools::SHAVE::Assembler::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + ArgStringList CmdArgs; + + assert(Inputs.size() == 1); + const InputInfo &II = Inputs[0]; + assert(II.getType() == types::TY_PP_Asm); // Require preprocessed asm input. + assert(Output.getType() == types::TY_Object); + + CmdArgs.push_back("-no6thSlotCompression"); + const Arg *CPUArg = Args.getLastArg(options::OPT_mcpu_EQ); + if (CPUArg) + CmdArgs.push_back( + Args.MakeArgString("-cv:" + StringRef(CPUArg->getValue()))); + CmdArgs.push_back("-noSPrefixing"); + CmdArgs.push_back("-a"); // Mystery option. + Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); + for (const Arg *A : Args.filtered(options::OPT_I, options::OPT_isystem)) { + A->claim(); + CmdArgs.push_back( + Args.MakeArgString(std::string("-i:") + A->getValue(0))); + } + CmdArgs.push_back(II.getFilename()); + CmdArgs.push_back( + Args.MakeArgString(std::string("-o:") + Output.getFilename())); + + std::string Exec = + Args.MakeArgString(getToolChain().GetProgramPath("moviAsm")); + C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), + Args.MakeArgString(Exec), CmdArgs, + Inputs, Output)); +} + +void tools::Myriad::Linker::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + const auto &TC = + static_cast(getToolChain()); + const llvm::Triple &T = TC.getTriple(); + ArgStringList CmdArgs; + bool UseStartfiles = + !Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles); + bool UseDefaultLibs = + !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs); + // Silence warning if the args contain both -nostdlib and -stdlib=. + Args.getLastArg(options::OPT_stdlib_EQ); + + if (T.getArch() == llvm::Triple::sparc) + CmdArgs.push_back("-EB"); + else // SHAVE assumes little-endian, and sparcel is expressly so. 
+ CmdArgs.push_back("-EL"); + + // The remaining logic is mostly like gnutools::Linker::ConstructJob, + // but we never pass through a --sysroot option and various other bits. + // For example, there are no sanitizers (yet) nor gold linker. + + // Eat some arguments that may be present but have no effect. + Args.ClaimAllArgs(options::OPT_g_Group); + Args.ClaimAllArgs(options::OPT_w); + Args.ClaimAllArgs(options::OPT_static_libgcc); + + if (Args.hasArg(options::OPT_s)) // Pass the 'strip' option. + CmdArgs.push_back("-s"); + + CmdArgs.push_back("-o"); + CmdArgs.push_back(Output.getFilename()); + + if (UseStartfiles) { + // If you want startfiles, it means you want the builtin crti and crtbegin, + // but not crt0. Myriad link commands provide their own crt0.o as needed. + CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crti.o"))); + CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtbegin.o"))); + } + + Args.AddAllArgs(CmdArgs, + {options::OPT_L, options::OPT_T_Group, options::OPT_s, + options::OPT_t, options::OPT_Z_Flag, options::OPT_r}); + + TC.AddFilePathLibArgs(Args, CmdArgs); + + bool NeedsSanitizerDeps = addSanitizerRuntimes(TC, Args, CmdArgs); + AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); + + TC.AddTapirRuntimeLibArgs(Args, CmdArgs); + + if (UseDefaultLibs) { + if (NeedsSanitizerDeps) + linkSanitizerRuntimeDeps(TC, CmdArgs); + if (C.getDriver().CCCIsCXX()) { + if (TC.GetCXXStdlibType(Args) == ToolChain::CST_Libcxx) { + CmdArgs.push_back("-lc++"); + CmdArgs.push_back("-lc++abi"); + } else + CmdArgs.push_back("-lstdc++"); + } + if (T.getOS() == llvm::Triple::RTEMS) { + CmdArgs.push_back("--start-group"); + CmdArgs.push_back("-lc"); + CmdArgs.push_back("-lgcc"); // circularly dependent on rtems + // You must provide your own "-L" option to enable finding these. + CmdArgs.push_back("-lrtemscpu"); + CmdArgs.push_back("-lrtemsbsp"); + CmdArgs.push_back("--end-group"); + } else { + CmdArgs.push_back("-lc"); + CmdArgs.push_back("-lgcc"); + } + } + if (UseStartfiles) { + CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtend.o"))); + CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crtn.o"))); + } + + std::string Exec = + Args.MakeArgString(TC.GetProgramPath("sparc-myriad-rtems-ld")); + C.addCommand(std::make_unique( + JA, *this, ResponseFileSupport::AtFileCurCP(), Args.MakeArgString(Exec), + CmdArgs, Inputs, Output)); +} + +MyriadToolChain::MyriadToolChain(const Driver &D, const llvm::Triple &Triple, + const ArgList &Args) + : Generic_ELF(D, Triple, Args) { + // If a target of 'sparc-myriad-elf' is specified to clang, it wants to use + // 'sparc-myriad--elf' (note the unknown OS) as the canonical triple. + // This won't work to find gcc. Instead we give the installation detector an + // extra triple, which is preferable to further hacks of the logic that at + // present is based solely on getArch(). In particular, it would be wrong to + // choose the myriad installation when targeting a non-myriad sparc install. + switch (Triple.getArch()) { + default: + D.Diag(clang::diag::err_target_unsupported_arch) + << Triple.getArchName() << "myriad"; + [[fallthrough]]; + case llvm::Triple::shave: + return; + case llvm::Triple::sparc: + case llvm::Triple::sparcel: + GCCInstallation.init(Triple, Args, {"sparc-myriad-rtems"}); + } + + if (GCCInstallation.isValid()) { + // This directory contains crt{i,n,begin,end}.o as well as libgcc. + // These files are tied to a particular version of gcc. 
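+    // For illustration only (the version number is hypothetical), this install
+    // path typically looks like <prefix>/lib/gcc/sparc-myriad-rtems/6.3.0, and
+    // it is that directory which gets added to the file search paths below.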
+ SmallString<128> CompilerSupportDir(GCCInstallation.getInstallPath()); + addPathIfExists(D, CompilerSupportDir, getFilePaths()); + } + // libstd++ and libc++ must both be found in this one place. + addPathIfExists(D, D.Dir + "/../sparc-myriad-rtems/lib", getFilePaths()); +} + +MyriadToolChain::~MyriadToolChain() {} + +void MyriadToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { + if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + addSystemInclude(DriverArgs, CC1Args, getDriver().SysRoot + "/include"); +} + +void MyriadToolChain::addLibCxxIncludePaths( + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const { + std::string Path(getDriver().getInstalledDir()); + addSystemInclude(DriverArgs, CC1Args, Path + "/../include/c++/v1"); +} + +void MyriadToolChain::addLibStdCxxIncludePaths( + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const { + StringRef LibDir = GCCInstallation.getParentLibPath(); + const GCCVersion &Version = GCCInstallation.getVersion(); + StringRef TripleStr = GCCInstallation.getTriple().str(); + const Multilib &Multilib = GCCInstallation.getMultilib(); + addLibStdCXXIncludePaths( + LibDir.str() + "/../" + TripleStr.str() + "/include/c++/" + Version.Text, + TripleStr, Multilib.includeSuffix(), DriverArgs, CC1Args); +} + +// MyriadToolChain handles several triples: +// {shave,sparc{,el}}-myriad-{rtems,unknown}-elf +Tool *MyriadToolChain::SelectTool(const JobAction &JA) const { + // The inherited method works fine if not targeting the SHAVE. + if (!isShaveCompilation(getTriple())) + return ToolChain::SelectTool(JA); + switch (JA.getKind()) { + case Action::PreprocessJobClass: + case Action::CompileJobClass: + if (!Compiler) + Compiler.reset(new tools::SHAVE::Compiler(*this)); + return Compiler.get(); + case Action::AssembleJobClass: + if (!Assembler) + Assembler.reset(new tools::SHAVE::Assembler(*this)); + return Assembler.get(); + default: + return ToolChain::getTool(JA.getKind()); + } +} + +Tool *MyriadToolChain::buildLinker() const { + return new tools::Myriad::Linker(*this); +} + +SanitizerMask MyriadToolChain::getSupportedSanitizers() const { + return SanitizerKind::Address; +} diff --git a/clang/lib/Driver/ToolChains/NaCl.cpp b/clang/lib/Driver/ToolChains/NaCl.cpp index 22f038e5152ff23..4c4f785f071fe39 100644 --- a/clang/lib/Driver/ToolChains/NaCl.cpp +++ b/clang/lib/Driver/ToolChains/NaCl.cpp @@ -127,6 +127,8 @@ void nacltools::Linker::ConstructJob(Compilation &C, const JobAction &JA, AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); + ToolChain.AddTapirRuntimeLibArgs(Args, CmdArgs); + if (D.CCCIsCXX() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { if (ToolChain.ShouldLinkCXXStdlib(Args)) { diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 974e486a0082bce..d465f46fd783b67 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -206,6 +206,8 @@ void tools::PS4cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA); + TC.AddTapirRuntimeLibArgs(Args, CmdArgs); + if (Args.hasArg(options::OPT_pthread)) { CmdArgs.push_back("-lpthread"); } diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index e82ed2ca79ffd69..601297672633f67 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -206,6 +206,8 @@ 
void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   bool NeedsSanitizerDeps = addSanitizerRuntimes(ToolChain, Args, CmdArgs);
   AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA);
 
+  getToolChain().AddTapirRuntimeLibArgs(Args, CmdArgs);
+
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs,
                    options::OPT_r)) {
     // Use the static OpenMP runtime with -static-openmp
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 028fdb2cc6b9dac..38fe5aaec810231 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -30,6 +30,7 @@
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Options.h"
+#include "clang/Driver/Tapir.h"
 #include "clang/Frontend/CommandLineSourceLoc.h"
 #include "clang/Frontend/DependencyOutputOptions.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
@@ -86,6 +87,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Host.h"
 #include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Tapir/TapirTargetIDs.h"
 #include 
 #include 
 #include 
@@ -1551,6 +1553,10 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
   else if (!Opts.DirectAccessExternalData && LangOpts->PICLevel == 0)
     GenerateArg(Consumer, OPT_fno_direct_access_external_data);
 
+  if (std::optional TapirTargetStr =
+          serializeTapirTarget(Opts.getTapirTarget()))
+    GenerateArg(Consumer, OPT_ftapir_EQ, *TapirTargetStr);
+
   std::optional DebugInfoVal;
   switch (Opts.DebugInfo) {
   case llvm::codegenoptions::DebugLineTablesOnly:
@@ -1855,6 +1861,14 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
     Opts.setDebugInfo(llvm::codegenoptions::LimitedDebugInfo);
   }
 
+  // Parse Tapir-related codegen options.
+  TapirTargetID TapirTarget = parseTapirTarget(Args);
+  if (TapirTarget == TapirTargetID::Last_TapirTargetID)
+    if (const Arg *A = Args.getLastArg(OPT_ftapir_EQ))
+      Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args)
+                                                << A->getValue();
+  Opts.setTapirTarget(TapirTarget);
+
   for (const auto &Arg : Args.getAllArgValues(OPT_fdebug_prefix_map_EQ)) {
     auto Split = StringRef(Arg).split('=');
     Opts.DebugPrefixMap.emplace_back(Split.first, Split.second);
@@ -3552,7 +3566,7 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
   if (Opts.PIE)
     GenerateArg(Consumer, OPT_pic_is_pie);
   for (StringRef Sanitizer : serializeSanitizerKinds(Opts.Sanitize))
     GenerateArg(Consumer, OPT_fsanitize_EQ, Sanitizer);
 
   return;
 }
diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp
index cc6f18b5b319f95..4c53cd191d6ac40 100644
--- a/clang/lib/Parse/ParsePragma.cpp
+++ b/clang/lib/Parse/ParsePragma.cpp
@@ -213,6 +213,7 @@ struct PragmaSupportHandler : public PragmaHandler {
                     Token &FirstToken) override;
 };
 
+<<<<<<< HEAD
 struct PragmaOpenMPHandler : public PragmaSupportHandler>>>>>> 01b251155a72 ( Remove a lot of Cilk-specific code. More can probably be removed. check-clang,)
 
 /// PragmaCommentHandler - "\#pragma comment ...".
struct PragmaCommentHandler : public PragmaHandler { PragmaCommentHandler(Sema &Actions) diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 7f452d177c16f0b..d2c520accafc454 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -464,6 +464,9 @@ CheckForIncompatibleAttributes(Sema &S, // The vector predication only has a state form that is exposed by // #pragma clang loop vectorize_predicate (enable | disable). VectorizePredicate, + // The Tapir grainsize only has a numeric form that describes the + // amount to coarsen the parallel loop. + TapirGrainsize, // This serves as a indicator to how many category are listed in this enum. NumberOfCategories }; @@ -508,6 +511,9 @@ CheckForIncompatibleAttributes(Sema &S, case LoopHintAttr::PipelineInitiationInterval: Category = Pipeline; break; + case LoopHintAttr::TapirGrainsize: + Category = TapirGrainsize; + break; case LoopHintAttr::VectorizePredicate: Category = VectorizePredicate; break; diff --git a/clang/test/Sema/builtin-longjmp.c b/clang/test/Sema/builtin-longjmp.c index 99463cf3385a1eb..0de2cb96988c99c 100644 --- a/clang/test/Sema/builtin-longjmp.c +++ b/clang/test/Sema/builtin-longjmp.c @@ -4,8 +4,8 @@ // RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm < %s| FileCheck %s // RUN: %clang_cc1 -triple powerpc64-unknown-unknown -emit-llvm < %s| FileCheck %s // RUN: %clang_cc1 -triple ve-unknown-unknown -emit-llvm < %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm < %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm-only -verify %s // RUN: %clang_cc1 -triple mips-unknown-unknown -emit-llvm-only -verify %s // RUN: %clang_cc1 -triple mips64-unknown-unknown -emit-llvm-only -verify %s // RUN: %clang_cc1 -triple sparc-eabi-unknown -emit-llvm-only -verify %s diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index 290bf2a42536ddf..30994bf21c681eb 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -13,6 +13,7 @@ set( LLVM_LINK_COMPONENTS Option ScalarOpts Support + TapirOpts TargetParser TransformUtils Vectorize diff --git a/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake index f3c8fbe2c2fecdc..67a113b25b4febf 100644 --- a/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake @@ -329,6 +329,10 @@ macro(darwin_add_builtin_library name suffix) set_target_properties(${libname} PROPERTIES OSX_ARCHITECTURES ${LIB_ARCH}) + if (${arch} STREQUAL arm64 OR ${arch} STREQUAL arm64e) + add_dependencies(${libname} outline_atomic_helpers) + endif() + if(LIB_PARENT_TARGET) add_dependencies(${LIB_PARENT_TARGET} ${libname}) endif() diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index e0b2d08c207754b..412811faad746ef 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -589,6 +589,7 @@ if (COMPILER_RT_HAS_AARCH64_SME) endif() # Generate outline atomics helpers from lse.S base +set(atomic_helpers) set(OA_HELPERS_DIR "${CMAKE_CURRENT_BINARY_DIR}/outline_atomic_helpers.dir") file(MAKE_DIRECTORY "${OA_HELPERS_DIR}") @@ -614,12 +615,15 @@ foreach(pat cas swp ldadd ldclr ldeor ldset) COMPILE_DEFINITIONS "L_${pat};SIZE=${size};MODEL=${model}" INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}" ) + list(APPEND atomic_helpers "${helper_asm}") list(APPEND 
aarch64_SOURCES "${helper_asm}") endif() endforeach(model) endforeach(size) endforeach(pat) +add_custom_target(outline_atomic_helpers DEPENDS ${atomic_helpers}) + if (MINGW) set(aarch64_SOURCES ${aarch64_SOURCES} diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 947f3fead54e037..526485e4f952dc3 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -17,6 +17,7 @@ #include "llvm/Object/COFF.h" #include "llvm/Support/CachePruning.h" #include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" #include #include #include @@ -234,6 +235,10 @@ struct Configuration { // Used for /mapinfo. bool mapInfo = false; + // Used for Tapir target. + llvm::StringRef opencilkABIBitcodeFile; + llvm::TapirTargetID tapirTarget = llvm::TapirTargetID::None; + // Used for /thinlto-index-only: llvm::StringRef thinLTOIndexOnlyArg; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 9e28b1c50be5049..4f61749e3d812cc 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -811,6 +811,7 @@ static std::string createResponseFile(const opt::InputArgList &args, case OPT_deffile: case OPT_manifestinput: case OPT_natvis: + case OPT_opencilk_abi_bitcode: os << arg->getSpelling() << quote(rewritePath(arg->getValue())) << '\n'; break; case OPT_order: { @@ -2055,6 +2056,11 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (args.hasFlag(OPT_inferasanlibs, OPT_inferasanlibs_no, false)) warn("ignoring '/inferasanlibs', this flag is not supported"); + config->opencilkABIBitcodeFile = + args.getLastArgValue(OPT_opencilk_abi_bitcode); + config->tapirTarget = + args::parseTapirTarget(args.getLastArgValue(OPT_tapir_target)); + if (config->incremental && args.hasArg(OPT_profile)) { warn("ignoring '/incremental' due to '/profile' specification"); config->incremental = false; diff --git a/lld/COFF/LTO.cpp b/lld/COFF/LTO.cpp index 5c881bc01c663d5..899fa160109357d 100644 --- a/lld/COFF/LTO.cpp +++ b/lld/COFF/LTO.cpp @@ -83,6 +83,9 @@ lto::Config BitcodeCompiler::createConfig() { c.CGOptLevel = *optLevelOrNone; c.AlwaysEmitRegularLTOObj = !ctx.config.ltoObjPath.empty(); c.DebugPassManager = ctx.config.ltoDebugPassManager; + if (args::validTapirTarget(ctx.config.tapirTarget)) + c.TapirTarget = ctx.config.tapirTarget; + c.OpenCilkABIBitcodeFile = std::string(ctx.config.opencilkABIBitcodeFile); c.CSIRProfile = std::string(ctx.config.ltoCSProfileFile); c.RunCSIRInstr = ctx.config.ltoCSProfileGenerate; c.PGOWarnMismatch = ctx.config.ltoPGOWarnMismatch; diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td index 1e78a560bca8648..f85ce9755cbc802 100644 --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -240,6 +240,9 @@ defm lld_allow_duplicate_weak : B_priv<"lld-allow-duplicate-weak">; def lldemit : P<"lldemit", "Specify output type">; def lldmingw : F<"lldmingw">; def noseh : F<"noseh">; +def opencilk_abi_bitcode : P< + "opencilk-abi-bitcode", + "Path to OpenCilk ABI bitcode file">; def osversion : P_priv<"osversion">; def output_def : Joined<["/", "-", "/?", "-?"], "output-def:">; def pdb_source_path : P<"pdbsourcepath", @@ -249,6 +252,7 @@ def rsp_quoting : Joined<["--"], "rsp-quoting=">, def start_lib : F<"start-lib">, HelpText<"Start group of objects treated as if they were in a library">; defm stdcall_fixup : B_priv<"stdcall-fixup">; +def tapir_target : P<"tapir-target", "Specify the target for Tapir lowering">; def thinlto_emit_imports_files : F<"thinlto-emit-imports-files">, HelpText<"Emit .imports files with -thinlto-index-only">; diff --git a/lld/Common/Args.cpp 
b/lld/Common/Args.cpp index 5546b2aece641e2..2aae93beed5cd5e 100644 --- a/lld/Common/Args.cpp +++ b/lld/Common/Args.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" @@ -90,3 +91,19 @@ StringRef lld::args::getFilenameWithoutExe(StringRef path) { return sys::path::stem(path); return sys::path::filename(path); } + +TapirTargetID lld::args::parseTapirTarget(StringRef tapirTarget) { + return llvm::StringSwitch(tapirTarget) + .Case("none", TapirTargetID::None) + .Case("serial", TapirTargetID::Serial) + .Case("cheetah", TapirTargetID::Cheetah) + .Case("lambda", TapirTargetID::Lambda) + .Case("omptask", TapirTargetID::OMPTask) + .Case("opencilk", TapirTargetID::OpenCilk) + .Case("qthreads", TapirTargetID::Qthreads) + .Default(TapirTargetID::Last_TapirTargetID); +} + +bool lld::args::validTapirTarget(TapirTargetID TargetID) { + return TargetID < TapirTargetID::Last_TapirTargetID; +} diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 28726d48e428420..21f36098f28931b 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -27,6 +27,7 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/GlobPattern.h" #include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" #include #include #include @@ -173,6 +174,7 @@ struct Config { llvm::StringRef ltoSampleProfile; llvm::StringRef mapFile; llvm::StringRef outputFile; + llvm::StringRef opencilkABIBitcodeFile; llvm::StringRef optRemarksFilename; std::optional optRemarksHotnessThreshold = 0; llvm::StringRef optRemarksPasses; @@ -183,6 +185,7 @@ struct Config { llvm::StringRef printSymbolOrder; llvm::StringRef soName; llvm::StringRef sysroot; + llvm::TapirTargetID tapirTarget = llvm::TapirTargetID::None; llvm::StringRef thinLTOCacheDir; llvm::StringRef thinLTOIndexOnlyArg; llvm::StringRef whyExtract; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index eb6734dfd458d5b..29ebd2679f045b5 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1360,6 +1360,8 @@ static void readConfigs(opt::InputArgList &args) { config->nostdlib = args.hasArg(OPT_nostdlib); config->oFormatBinary = isOutputFormatBinary(args); config->omagic = args.hasFlag(OPT_omagic, OPT_no_omagic, false); + config->opencilkABIBitcodeFile = + args.getLastArgValue(OPT_opencilk_abi_bitcode); config->optRemarksFilename = args.getLastArgValue(OPT_opt_remarks_filename); config->optStatsFilename = args.getLastArgValue(OPT_plugin_opt_stats_file); @@ -1420,6 +1422,8 @@ static void readConfigs(opt::InputArgList &args) { config->splitStackAdjustSize = args::getInteger(args, OPT_split_stack_adjust_size, 16384); config->strip = getStrip(args); config->sysroot = args.getLastArgValue(OPT_sysroot); + config->tapirTarget = + args::parseTapirTarget(args.getLastArgValue(OPT_tapir_target)); config->target1Rel = args.hasFlag(OPT_target1_rel, OPT_target1_abs, false); config->target2 = getTarget2(args); config->thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); diff --git a/lld/ELF/DriverUtils.cpp b/lld/ELF/DriverUtils.cpp index ac74604408152d6..f058352f19abb6d 100644 --- a/lld/ELF/DriverUtils.cpp +++ b/lld/ELF/DriverUtils.cpp @@ -191,6 +191,7 @@ std::string elf::createResponseFile(const opt::InputArgList &args) { case OPT_export_dynamic_symbol_list: case OPT_just_symbols: case OPT_library_path: + case OPT_opencilk_abi_bitcode: case OPT_remap_inputs_file: case OPT_retain_symbols_file: 
case OPT_rpath: diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp index 935d0a9eab9ee06..8cd46c4fe4b149a 100644 --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -130,6 +130,10 @@ static lto::Config createConfig() { c.DebugPassManager = config->ltoDebugPassManager; c.DwoDir = std::string(config->dwoDir); + if (args::validTapirTarget(config->tapirTarget)) + c.TapirTarget = config->tapirTarget; + c.OpenCilkABIBitcodeFile = std::string(config->opencilkABIBitcodeFile); + c.HasWholeProgramVisibility = config->ltoWholeProgramVisibility; c.ValidateAllVtablesHaveTypeInfos = config->ltoValidateAllVtablesHaveTypeInfos; diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index 74733efb28ff5e5..a72c9678a7cfb89 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -644,6 +644,7 @@ defm lto_whole_program_visibility: BB<"lto-whole-program-visibility", "Asserts that the LTO link does not have whole program visibility">; def disable_verify: F<"disable-verify">; defm mllvm: Eq<"mllvm", "Additional arguments to forward to LLVM's option processing">; +defm opencilk_abi_bitcode: EEq<"opencilk-abi-bitcode", "Path to OpenCilk ABI bitcode file">; def opt_remarks_filename: Separate<["--"], "opt-remarks-filename">, HelpText<"YAML output file for optimization remarks">; defm opt_remarks_hotness_threshold: EEq<"opt-remarks-hotness-threshold", @@ -671,6 +672,7 @@ defm shuffle_sections: EEq<"shuffle-sections", "Shuffle matched sections using the given seed before mapping them to the output sections. " "If -1, reverse the section order. If 0, use a random seed">, MetaVarName<"=">; +defm tapir_target: Eq<"tapir-target", "Specify the target for Tapir lowering">; def thinlto_cache_dir: JJ<"thinlto-cache-dir=">, HelpText<"Path to ThinLTO cached object file directory">; defm thinlto_cache_policy: EEq<"thinlto-cache-policy", "Pruning policy for the ThinLTO cache">; @@ -709,6 +711,9 @@ def: J<"plugin-opt=cs-profile-path=">, def: J<"plugin-opt=obj-path=">, Alias, HelpText<"Alias for --lto-obj-path=">; +def: J<"plugin-opt=opencilk-abi-bitcode=">, + Alias, + HelpText<"Alias for --opencilk-abi-bitcode">; def: J<"plugin-opt=opt-remarks-filename=">, Alias, HelpText<"Alias for --opt-remarks-filename">; @@ -729,6 +734,9 @@ def: J<"plugin-opt=sample-profile=">, def: F<"plugin-opt=save-temps">, Alias, HelpText<"Alias for --save-temps">; def plugin_opt_stats_file: J<"plugin-opt=stats-file=">, HelpText<"Filename to write LTO statistics to">; +def: J<"plugin-opt=tapir-target=">, + Alias, + HelpText<"Alias for --tapir-target=">; def: F<"plugin-opt=thinlto-emit-imports-files">, Alias, HelpText<"Alias for --thinlto-emit-imports-files">; diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index e79812b16ec1282..097a9da130232bb 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -23,6 +23,7 @@ #include "llvm/TextAPI/Architecture.h" #include "llvm/TextAPI/Platform.h" #include "llvm/TextAPI/Target.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" #include @@ -238,6 +239,9 @@ struct Configuration { std::vector dyldEnvs; + llvm::TapirTargetID tapirTarget = llvm::TapirTargetID::None; + llvm::StringRef opencilkABIBitcodeFile; + llvm::MachO::Architecture arch() const { return platformInfo.target.Arch; } llvm::MachO::PlatformType platform() const { diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 1883acc781ecef8..cb9ac9ae420737e 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -1749,6 +1749,10 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, args.hasFlag(OPT_warn_thin_archive_missing_members, 
OPT_no_warn_thin_archive_missing_members, true); config->generateUuid = !args.hasArg(OPT_no_uuid); + config->tapirTarget = + args::parseTapirTarget(args.getLastArgValue(OPT_tapir_target)); + config->opencilkABIBitcodeFile = + args.getLastArgValue(OPT_opencilk_abi_bitcode); for (const Arg *arg : args.filtered(OPT_alias)) { config->aliasedSymbols.push_back( diff --git a/lld/MachO/LTO.cpp b/lld/MachO/LTO.cpp index 7a9a9223a03227f..f5b56a9eb5ce639 100644 --- a/lld/MachO/LTO.cpp +++ b/lld/MachO/LTO.cpp @@ -62,6 +62,9 @@ static lto::Config createConfig() { c.PGOWarnMismatch = config->pgoWarnMismatch; c.OptLevel = config->ltoo; c.CGOptLevel = config->ltoCgo; + if (args::validTapirTarget(config->tapirTarget)) + c.TapirTarget = config->tapirTarget; + c.OpenCilkABIBitcodeFile = std::string(config->opencilkABIBitcodeFile); if (config->saveTemps) checkError(c.addSaveTemps(config->outputFile.str() + ".", /*UseInputModulePath=*/true)); diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index bbd8bf70c3a0c54..3e2e6b6a54bceb7 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -156,6 +156,12 @@ defm pgo_warn_mismatch: BB<"pgo-warn-mismatch", defm warn_thin_archive_missing_members : BB<"warn-thin-archive-missing-members", "Warn on missing object files referenced by thin archives (default)", "Do not warn on missing object files referenced by thin archives">, Group; +def tapir_target: Joined<["--"], "tapir-target=">, + HelpText<"Specify the target for Tapir lowering">, + Group; +def opencilk_abi_bitcode: Joined<["--"], "opencilk-abi-bitcode=">, + HelpText<"Path to the OpenCilk ABI bitcode file">, + Group; // This is a complete Options.td compiled from Apple's ld(1) manpage // dated 2018-03-07 and cross checked with ld64 source code in repo diff --git a/lld/include/lld/Common/Args.h b/lld/include/lld/Common/Args.h index 60f83fbbbf1a3c9..f0e65c493b996f1 100644 --- a/lld/include/lld/Common/Args.h +++ b/lld/include/lld/Common/Args.h @@ -12,6 +12,7 @@ #include "lld/Common/LLVM.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" #include namespace llvm { @@ -40,6 +41,10 @@ std::vector getLines(MemoryBufferRef mb); StringRef getFilenameWithoutExe(StringRef path); +llvm::TapirTargetID parseTapirTarget(StringRef tapirTarget); + +bool validTapirTarget(llvm::TapirTargetID TargetID); + } // namespace args } // namespace lld diff --git a/llvm/.gitmodules b/llvm/.gitmodules new file mode 100644 index 000000000000000..e69de29bb2d1d64 diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 12618966c4adfd2..53033880a65b602 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -19,11 +19,39 @@ include(${LLVM_COMMON_CMAKE_UTILS}/Modules/LLVMVersion.cmake) set_directory_properties(PROPERTIES LLVM_VERSION_MAJOR "${LLVM_VERSION_MAJOR}") +if(NOT DEFINED OPENCILK_VERSION_MAJOR) + set(OPENCILK_VERSION_MAJOR 2) +endif() +if(NOT DEFINED OPENCILK_VERSION_MINOR) + set(OPENCILK_VERSION_MINOR 0) +endif() +if(NOT DEFINED OPENCILK_VERSION_PATCH) + set(OPENCILK_VERSION_PATCH 0) +endif() +if(NOT DEFINED OPENCILK_VERSION_SUFFIX) + set(OPENCILK_VERSION_SUFFIX) +endif() + +if(NOT DEFINED TAPIR_VERSION_MAJOR) + set(TAPIR_VERSION_MAJOR 1) +endif() +if(NOT DEFINED TAPIR_VERSION_MINOR) + set(TAPIR_VERSION_MINOR 0) +endif() +if(NOT DEFINED TAPIR_VERSION_PATCH) + set(TAPIR_VERSION_PATCH 0) +endif() + if (NOT PACKAGE_VERSION) set(PACKAGE_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}") endif() +if 
(NOT OPENCILK_PACKAGE_VERSION) + set(OPENCILK_PACKAGE_VERSION + "${OPENCILK_VERSION_MAJOR}.${OPENCILK_VERSION_MINOR}.${OPENCILK_VERSION_PATCH}${OPENCILK_VERSION_SUFFIX}") +endif() + if(NOT DEFINED LLVM_SHLIB_SYMBOL_VERSION) # "Symbol version prefix for libLLVM.so" set(LLVM_SHLIB_SYMBOL_VERSION "LLVM_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}") @@ -114,7 +142,7 @@ endif() # LLVM_EXTERNAL_${project}_SOURCE_DIR using LLVM_ALL_PROJECTS # This allows an easy way of setting up a build directory for llvm and another # one for llvm+clang+... using the same sources. -set(LLVM_ALL_PROJECTS "bolt;clang;clang-tools-extra;compiler-rt;cross-project-tests;libc;libclc;lld;lldb;mlir;openmp;polly;pstl") +set(LLVM_ALL_PROJECTS "bolt;cheetah;cilktools;clang;clang-tools-extra;compiler-rt;cross-project-tests;libc;libclc;lld;lldb;mlir;openmp;polly;pstl") # The flang project is not yet part of "all" projects (see C++ requirements) set(LLVM_EXTRA_PROJECTS "flang") # List of all known projects in the mono repo @@ -362,7 +390,7 @@ option(LLVM_TOOL_LLVM_DRIVER_BUILD "Enables building the llvm multicall tool" OF set(PACKAGE_NAME LLVM) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") -set(PACKAGE_BUGREPORT "https://github.com/llvm/llvm-project/issues/") +set(PACKAGE_BUGREPORT "https://github.com/OpenCilk/opencilk-project/issues/") set(BUG_REPORT_URL "${PACKAGE_BUGREPORT}" CACHE STRING "Default URL where bug reports are to be submitted.") @@ -370,17 +398,20 @@ set(LLDB_BUG_REPORT_URL "${BUG_REPORT_URL}" CACHE STRING "Default URL where lldb bug reports are to be submitted.") # Configure CPack. +if(NOT DEFINED CPACK_PACKAGE_NAME) + set(CPACK_PACKAGE_NAME "OpenCilk") +endif() if(NOT DEFINED CPACK_PACKAGE_INSTALL_DIRECTORY) set(CPACK_PACKAGE_INSTALL_DIRECTORY "LLVM") endif() if(NOT DEFINED CPACK_PACKAGE_VENDOR) - set(CPACK_PACKAGE_VENDOR "LLVM") + set(CPACK_PACKAGE_VENDOR "OpenCilk") endif() -set(CPACK_PACKAGE_VERSION_MAJOR ${LLVM_VERSION_MAJOR}) -set(CPACK_PACKAGE_VERSION_MINOR ${LLVM_VERSION_MINOR}) -set(CPACK_PACKAGE_VERSION_PATCH ${LLVM_VERSION_PATCH}) -set(CPACK_PACKAGE_VERSION ${PACKAGE_VERSION}) -set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.TXT") +set(CPACK_PACKAGE_VERSION_MAJOR ${OPENCILK_VERSION_MAJOR}) +set(CPACK_PACKAGE_VERSION_MINOR ${OPENCILK_VERSION_MINOR}) +set(CPACK_PACKAGE_VERSION_PATCH ${OPENCILK_VERSION_PATCH}) +set(CPACK_PACKAGE_VERSION ${OPENCILK_PACKAGE_VERSION}) +set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/../MIT_LICENSE.TXT") if(WIN32 AND NOT UNIX) set(CPACK_NSIS_COMPRESSOR "/SOLID lzma \r\n SetCompressorDictSize 32") if(NOT DEFINED CPACK_PACKAGE_INSTALL_REGISTRY_KEY) @@ -1238,7 +1269,7 @@ if( LLVM_INCLUDE_UTILS ) if( LLVM_INCLUDE_TESTS ) set(LLVM_SUBPROJECT_TITLE "Third-Party/Google Test") add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest) - set(LLVM_SUBPROJECT_TITLE) + set(LLVM_SUBPROJECT_TITLE) endif() else() if ( LLVM_INCLUDE_TESTS ) diff --git a/llvm/CODE_OWNERS.TXT b/llvm/CODE_OWNERS.TXT index d1620d1cbf870e3..31e546665b9bbd4 100644 --- a/llvm/CODE_OWNERS.TXT +++ b/llvm/CODE_OWNERS.TXT @@ -212,6 +212,10 @@ N: Duncan Sands E: baldrick@free.fr D: DragonEgg +N: Tao B. 
Schardl +E: neboat@mit.edu +D: Tapir, CSI and CilkSanitizer instrumentation passes + N: Mark Schimmel E: marksl@synopsys.com D: ARC backend (lib/Target/ARC/*) diff --git a/llvm/CREDITS.TXT b/llvm/CREDITS.TXT index a6f042779da2e11..fd278380f2bd1bc 100644 --- a/llvm/CREDITS.TXT +++ b/llvm/CREDITS.TXT @@ -74,6 +74,10 @@ N: Brendon Cahoon E: bcahoon@codeaurora.org D: Loop unrolling with run-time trip counts. +N: John Carr +E: jfc@mit.edu +D: OpenCilk frontend, Tapir lowering to OpenCilk, _Hyperobject type, Tapir function attributes + N: Chandler Carruth E: chandlerc@gmail.com E: chandlerc@google.com @@ -115,6 +119,10 @@ N: Anshuman Dasgupta E: adasgupt@codeaurora.org D: Deterministic finite automaton based infrastructure for VLIW packetization +N: Tyler Denniston +E: denniston.t@gmail.com +D: CSI instrumentation pass and runtime + N: Stefanus Du Toit E: stefanus.du.toit@intel.com D: Bug fixes and minor improvements @@ -310,6 +318,10 @@ W: https://apt.llvm.org/ D: Debian and Ubuntu packaging D: Continuous integration with jenkins +N: I-Ting Angelina Lee +E: angelee@wustl.edu +D: cilksan + N: Andrew Lenharth E: alenhar2@cs.uiuc.edu W: http://www.lenharth.org/~andrewl/ @@ -361,6 +373,10 @@ N: Scott Michel E: scottm@aero.org D: Added STI Cell SPU backend. +N: William S. Moses +E: wmoses@mit.edu +D: Tapir, Tapir lowering passes for Cilk and OpenMP + N: Kai Nacke E: kai@redstar.de D: Support for implicit TLS model used with MS VC runtime @@ -480,6 +496,11 @@ N: Alina Sbirlea E: alina.sbirlea@gmail.com D: MemorySSA, BatchAA, misc loop and new pass manager work. +N: Tao B. Schardl +E: neboat@mit.edu +D: Tapir, Cilk frontend, Tapir lowering passes for Cilk +D: CSI, cilksan, cilkscale + N: Arnold Schwaighofer E: arnold.schwaighofer@gmail.com D: Tail call optimization for the x86 backend @@ -532,6 +553,10 @@ E: lauro.venancio@indt.org.br D: ARM backend improvements D: Thread Local Storage implementation +N: Daniele Vettorel +E: vettoreldaniele@gmail.com +D: CSI modifications to support JIT compilation + N: Phoebe Wang E: phoebe.wang@intel.com D: X86 bug fixes and new instruction support. diff --git a/llvm/README.md b/llvm/README.md new file mode 100644 index 000000000000000..2bd1521d3574362 --- /dev/null +++ b/llvm/README.md @@ -0,0 +1,23 @@ +Tapir/LLVM +================================ + +This directory and its subdirectories contain source code for +Tapir/LLVM, a prototype compiler based on LLVM that implements the +Tapir compiler IR extensions for fork-join parallelism. + +Tapir/LLVM is under active development. This directory contains +prototype implementations of compiler technologies that take advantage +of the Tapir compiler IR. + +Tapir/LLVM is open source software. You may freely distribute it +under the terms of the license agreement found in LICENSE.txt. + +![](https://github.com/wsmoses/Tapir-LLVM/workflows/Tapir%20CI/badge.svg) + + +# References + +T. B. Schardl, W. S. Moses, C. E. Leiserson. "Tapir: Embedding +Fork-Join Parallelism into LLVM's Intermediate Representation." ACM +PPoPP, February 2017, pp. 249-265. Won Best Paper Award. +http://dl.acm.org/citation.cfm?id=3018758 diff --git a/llvm/README.txt b/llvm/README.txt deleted file mode 100644 index b9b71a3b6daff15..000000000000000 --- a/llvm/README.txt +++ /dev/null @@ -1,17 +0,0 @@ -The LLVM Compiler Infrastructure -================================ - -This directory and its subdirectories contain source code for LLVM, -a toolkit for the construction of highly optimized compilers, -optimizers, and runtime environments. 
- -LLVM is open source software. You may freely distribute it under the terms of -the license agreement found in LICENSE.txt. - -Please see the documentation provided in docs/ for further -assistance with LLVM, and in particular docs/GettingStarted.rst for getting -started with LLVM and docs/README.txt for an overview of LLVM's -documentation setup. - -If you are writing a package for LLVM, see docs/Packaging.rst for our -suggestions. diff --git a/llvm/WORKSPACE b/llvm/WORKSPACE new file mode 100644 index 000000000000000..920b03c8faf0e98 --- /dev/null +++ b/llvm/WORKSPACE @@ -0,0 +1 @@ +workspace( name = "llvm" ) diff --git a/llvm/bindings/ocaml/llvm/META.llvm.in b/llvm/bindings/ocaml/llvm/META.llvm.in index def5444b046e45a..23d9b74b215865b 100644 --- a/llvm/bindings/ocaml/llvm/META.llvm.in +++ b/llvm/bindings/ocaml/llvm/META.llvm.in @@ -53,6 +53,14 @@ package "irreader" ( archive(native) = "llvm_irreader.cmxa" ) +package "tapir_opts" ( + requires = "llvm" + version = "@PACKAGE_VERSION@" + description = "Tapir Transforms for LLVM" + archive(byte) = "llvm_tapir_opts.cma" + archive(native) = "llvm_tapir_opts.cmxa" +) + package "transform_utils" ( requires = "llvm" version = "@PACKAGE_VERSION@" diff --git a/llvm/bindings/ocaml/llvm/llvm.ml b/llvm/bindings/ocaml/llvm/llvm.ml index 86b010e0ac22daa..1318846f17a4c56 100644 --- a/llvm/bindings/ocaml/llvm/llvm.ml +++ b/llvm/bindings/ocaml/llvm/llvm.ml @@ -1332,6 +1332,15 @@ external build_icmp : Icmp.t -> llvalue -> llvalue -> string -> external build_fcmp : Fcmp.t -> llvalue -> llvalue -> string -> llbuilder -> llvalue = "llvm_build_fcmp" +(*--.. Parallel constructs .................................................--*) + +external build_detach : llbasicblock -> llbasicblock -> llvalue -> llbuilder -> + llvalue = "llvm_build_detach" +external build_reattach : llbasicblock -> llvalue -> llbuilder -> llvalue + = "llvm_build_reattach" +external build_sync : llbasicblock -> llvalue -> llbuilder -> llvalue + = "llvm_build_sync" + (*--... Miscellaneous instructions .........................................--*) external build_phi : (llvalue * llbasicblock) list -> string -> llbuilder -> llvalue = "llvm_build_phi" diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli index c16530d3a70cb43..6e7330deb1ddd9a 100644 --- a/llvm/bindings/ocaml/llvm/llvm.mli +++ b/llvm/bindings/ocaml/llvm/llvm.mli @@ -2419,6 +2419,27 @@ val build_fcmp : Fcmp.t -> llvalue -> llvalue -> string -> llbuilder -> llvalue +(** {Parallel constructs} *) + +(** [build_detach dbb cbb r b] creates a + [detach within %r, %dbb, %cbb] + instruction at the position specified by the instruction builder [b]. + See the method [llvm::LLVMBuilder::CreateDetach]. *) +val build_detach : llbasicblock -> llbasicblock -> llvalue -> llbuilder -> + llvalue + +(** [build_reattach bb r b] creates a + [reattach within %r, %bb] + instruction at the position specified by the instruction builder [b]. + See the method [llvm::LLVMBuilder::CreateReattach]. *) +val build_reattach : llbasicblock -> llvalue -> llbuilder -> llvalue + +(** [build_sync bb r b] creates a + [sync within %r, %bb] + instruction at the position specified by the instruction builder [b]. + See the method [llvm::LLVMBuilder::CreateSync]. 
*) +val build_sync : llbasicblock -> llvalue -> llbuilder -> llvalue + (** {7 Miscellaneous instructions} *) (** [build_phi incoming name b] creates a diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c index 4ac824cd6a98a66..d3ef8ff87c369c9 100644 --- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c +++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c @@ -2585,6 +2585,30 @@ value llvm_build_fcmp(value Pred, value LHS, value RHS, value Name, value B) { Value_val(RHS), String_val(Name))); } +/*--.. Parallel constructs .................................................--*/ + +/* llbasicblock -> llbasicblock -> llvalue -> llbuilder -> llvalue */ +CAMLprim LLVMValueRef llvm_build_detach(LLVMBasicBlockRef DetachBB, + LLVMBasicBlockRef ContinueBB, + LLVMValueRef SyncRegion, + LLVMBuilderRef B) { + return LLVMBuildDetach(Builder_val(B), DetachBB, ContinueBB, SyncRegion); +} + +/* llbasicblock -> llvalue -> llbuilder -> llvalue */ +CAMLprim LLVMValueRef llvm_build_reattach(LLVMBasicBlockRef ReattachBB, + LLVMValueRef SyncRegion, + LLVMBuilderRef B) { + return LLVMBuildReattach(Builder_val(B), ReattachBB, SyncRegion); +} + +/* llbasicblock -> llvalue -> llbuilder -> llvalue */ +CAMLprim LLVMValueRef llvm_build_sync(LLVMBasicBlockRef ContinueBB, + LLVMValueRef SyncRegion, + LLVMBuilderRef B) { + return LLVMBuildSync(Builder_val(B), ContinueBB, SyncRegion); +} + /*--... Miscellaneous instructions .........................................--*/ /* (llvalue * llbasicblock) list -> string -> llbuilder -> llvalue */ diff --git a/llvm/bindings/ocaml/transforms/CMakeLists.txt b/llvm/bindings/ocaml/transforms/CMakeLists.txt index 014c56285c59cf5..128a6cbf7f90570 100644 --- a/llvm/bindings/ocaml/transforms/CMakeLists.txt +++ b/llvm/bindings/ocaml/transforms/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(passbuilder) +add_subdirectory(tapir_opts) add_subdirectory(utils) diff --git a/llvm/bindings/ocaml/transforms/tapir_opts/CMakeLists.txt b/llvm/bindings/ocaml/transforms/tapir_opts/CMakeLists.txt new file mode 100644 index 000000000000000..d315ca0c9ab660b --- /dev/null +++ b/llvm/bindings/ocaml/transforms/tapir_opts/CMakeLists.txt @@ -0,0 +1,5 @@ +add_ocaml_library(llvm_tapir_opts + OCAML llvm_tapir_opts + OCAMLDEP llvm + C tapir_opts_ocaml + LLVM tapiropts) diff --git a/llvm/bindings/ocaml/transforms/tapir_opts/llvm_tapir_opts.ml b/llvm/bindings/ocaml/transforms/tapir_opts/llvm_tapir_opts.ml new file mode 100644 index 000000000000000..1a12243265a7160 --- /dev/null +++ b/llvm/bindings/ocaml/transforms/tapir_opts/llvm_tapir_opts.ml @@ -0,0 +1,19 @@ +(*===-- llvm_tapir_opts.ml - LLVM OCaml Interface -------------*- OCaml -*-===* + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===----------------------------------------------------------------------===*) + +(** Tapir pass to install Cilky (or other target-specific) stuff in place of + detach/sync instructions. *) +external add_lower_tapir_to_target : + [ `Module ] Llvm.PassManager.t -> unit + = "llvm_add_lower_tapir_to_target" + +(** Tapir pass to spawn loops with recursive divide-and-conquer. 
*) +external add_loop_spawning : + [ `Module ] Llvm.PassManager.t -> unit + = "llvm_add_loop_spawning" diff --git a/llvm/bindings/ocaml/transforms/tapir_opts/llvm_tapir_opts.mli b/llvm/bindings/ocaml/transforms/tapir_opts/llvm_tapir_opts.mli new file mode 100644 index 000000000000000..1a12243265a7160 --- /dev/null +++ b/llvm/bindings/ocaml/transforms/tapir_opts/llvm_tapir_opts.mli @@ -0,0 +1,19 @@ +(*===-- llvm_tapir_opts.ml - LLVM OCaml Interface -------------*- OCaml -*-===* + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===----------------------------------------------------------------------===*) + +(** Tapir pass to install Cilky (or other target-specific) stuff in place of + detach/sync instructions. *) +external add_lower_tapir_to_target : + [ `Module ] Llvm.PassManager.t -> unit + = "llvm_add_lower_tapir_to_target" + +(** Tapir pass to spawn loops with recursive divide-and-conquer. *) +external add_loop_spawning : + [ `Module ] Llvm.PassManager.t -> unit + = "llvm_add_loop_spawning" diff --git a/llvm/bindings/ocaml/transforms/tapir_opts/tapir_opts_ocaml.c b/llvm/bindings/ocaml/transforms/tapir_opts/tapir_opts_ocaml.c new file mode 100644 index 000000000000000..207b9549b908964 --- /dev/null +++ b/llvm/bindings/ocaml/transforms/tapir_opts/tapir_opts_ocaml.c @@ -0,0 +1,33 @@ +/*===-- tapir_opts_ocaml.c - LLVM OCaml Glue --------------------*- C++ -*-===*\ +|* *| +|* The LLVM Compiler Infrastructure *| +|* *| +|* This file is distributed under the University of Illinois Open Source *| +|* License. See LICENSE.TXT for details. *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This file glues LLVM's OCaml interface to its C interface. These functions *| +|* are by and large transparent wrappers to the corresponding C functions. 
*| +|* *| +\*===----------------------------------------------------------------------===*/ + +#include "caml/custom.h" +#include "llvm-c/Transforms/PassManagerBuilder.h" +#include "llvm-c/Transforms/Tapir.h" + +/* [`Module] Llvm.PassManager.t -> unit + */ +CAMLprim value llvm_add_lower_tapir_to_target(LLVMPassManagerRef PM) +{ + LLVMAddLowerTapirToTargetPass(PM); + return Val_unit; +} + +/* [`Module] Llvm.PassManager.t -> unit + */ +CAMLprim value llvm_add_loop_spawning(LLVMPassManagerRef PM) +{ + LLVMAddLoopSpawningPass(PM); + return Val_unit; +} diff --git a/llvm/examples/Kaleidoscope/CMakeLists.txt b/llvm/examples/Kaleidoscope/CMakeLists.txt index 6ad3b6156647209..89e5ddd06d2492c 100644 --- a/llvm/examples/Kaleidoscope/CMakeLists.txt +++ b/llvm/examples/Kaleidoscope/CMakeLists.txt @@ -15,3 +15,4 @@ add_subdirectory(Chapter6) add_subdirectory(Chapter7) add_subdirectory(Chapter8) add_subdirectory(Chapter9) +add_subdirectory(Tapir) diff --git a/llvm/examples/Kaleidoscope/Tapir/CMakeLists.txt b/llvm/examples/Kaleidoscope/Tapir/CMakeLists.txt new file mode 100644 index 000000000000000..8bd3d0e9f2908f4 --- /dev/null +++ b/llvm/examples/Kaleidoscope/Tapir/CMakeLists.txt @@ -0,0 +1,20 @@ +set(LLVM_LINK_COMPONENTS + Analysis + Core + ExecutionEngine + InstCombine + Object + OrcJIT + RuntimeDyld + ScalarOpts + Support + TapirOpts + TransformUtils + native + ) + +add_kaleidoscope_chapter(Kaleidoscope-Tapir + toy.cpp + ) + +export_executable_symbols(Kaleidoscope-Tapir) diff --git a/llvm/examples/Kaleidoscope/Tapir/KaleidoscopeJIT.h b/llvm/examples/Kaleidoscope/Tapir/KaleidoscopeJIT.h new file mode 100644 index 000000000000000..38da08ffe2c9f44 --- /dev/null +++ b/llvm/examples/Kaleidoscope/Tapir/KaleidoscopeJIT.h @@ -0,0 +1,206 @@ +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Contains a simple JIT definition for use in the kaleidoscope tutorials. 
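+// A minimal usage sketch (the module M, context TSCtx, and the symbol name are
+// assumptions for illustration, not part of the tutorial itself):
+//
+//   auto JIT = cantFail(KaleidoscopeJIT::Create());
+//   cantFail(JIT->addModule(ThreadSafeModule(std::move(M), std::move(TSCtx))));
+//   cantFail(JIT->initialize()); // run the initializers recorded from ctors
+//   auto ExprSym = cantFail(JIT->lookup("anon_expr"));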
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H +#define LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/ExecutionEngine/Orc/IRTransformLayer.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/FormatVariadic.h" +#include + +namespace llvm { +namespace orc { + +class KaleidoscopeJIT { +private: + std::unique_ptr ES; + + DataLayout DL; + MangleAndInterner Mangle; + + RTDyldObjectLinkingLayer ObjectLayer; + IRCompileLayer CompileLayer; + IRTransformLayer InitHelperTransformLayer; + + JITDylib &MainJD; + + SymbolLookupSet InitFunctions; + SymbolLookupSet DeInitFunctions; + + /// This transform parses llvm.global_ctors to produce a single initialization + /// function for the module, records the function, then deletes + /// llvm.global_ctors. + class GlobalCtorDtorScraper { + public: + GlobalCtorDtorScraper(ExecutionSession &ES, SymbolLookupSet &InitFunctions, + StringRef InitFunctionPrefix) + : ES(ES), InitFunctions(InitFunctions), + InitFunctionPrefix(InitFunctionPrefix) {} + Expected operator()(ThreadSafeModule TSM, + MaterializationResponsibility &R) { + auto Err = TSM.withModuleDo([&](Module &M) -> Error { + auto &Ctx = M.getContext(); + auto *GlobalCtors = M.getNamedGlobal("llvm.global_ctors"); + // If there's no llvm.global_ctors or it's just a decl then skip. 
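+        // For reference, the consumed global looks roughly like this in IR
+        // (sketch; the priority and constructor name are invented):
+        //   @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }]
+        //       [{ i32, ptr, ptr } { i32 65535, ptr @ctor, ptr null }]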
+ if (!GlobalCtors || GlobalCtors->isDeclaration()) + return Error::success(); + + std::string InitFunctionName; + raw_string_ostream(InitFunctionName) + << InitFunctionPrefix << M.getModuleIdentifier(); + + MangleAndInterner Mangle(ES, M.getDataLayout()); + auto InternedName = Mangle(InitFunctionName); + if (auto Err = R.defineMaterializing( + {{InternedName, JITSymbolFlags::Callable}})) + return Err; + + auto *InitFunc = Function::Create( + FunctionType::get(Type::getVoidTy(Ctx), {}, false), + GlobalValue::ExternalLinkage, InitFunctionName, &M); + InitFunc->setVisibility(GlobalValue::HiddenVisibility); + std::vector> Inits; + for (auto E : getConstructors(M)) + Inits.push_back(std::make_pair(E.Func, E.Priority)); + llvm::sort(Inits, [](const std::pair &LHS, + const std::pair &RHS) { + return LHS.first < RHS.first; + }); + auto *EntryBlock = BasicBlock::Create(Ctx, "entry", InitFunc); + IRBuilder<> IB(EntryBlock); + for (auto &KV : Inits) + IB.CreateCall(KV.first); + IB.CreateRetVoid(); + + ES.runSessionLocked([&]() { InitFunctions.add(InternedName); }); + GlobalCtors->eraseFromParent(); + return Error::success(); + }); + + if (Err) + return std::move(Err); + + return std::move(TSM); + } + + private: + ExecutionSession &ES; + SymbolLookupSet &InitFunctions; + StringRef InitFunctionPrefix; + }; + +public: + KaleidoscopeJIT(std::unique_ptr ES, + JITTargetMachineBuilder JTMB, DataLayout DL) + : ES(std::move(ES)), DL(std::move(DL)), + Mangle(*this->ES, this->DL), + ObjectLayer(*this->ES, + []() { return std::make_unique(); }), + CompileLayer(*this->ES, ObjectLayer, + std::make_unique(std::move(JTMB))), + InitHelperTransformLayer( + *this->ES, CompileLayer, + GlobalCtorDtorScraper(*this->ES, InitFunctions, "my_init.")), + MainJD(this->ES->createBareJITDylib("
")) { + MainJD.addGenerator( + cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess( + DL.getGlobalPrefix()))); + } + + ~KaleidoscopeJIT() { + if (auto Err = ES->endSession()) + ES->reportError(std::move(Err)); + } + + static Expected> Create() { + auto SSP = std::make_shared(); + auto EPC = SelfExecutorProcessControl::Create(); + if (!EPC) + return EPC.takeError(); + + auto ES = std::make_unique(std::move(*EPC)); + + JITTargetMachineBuilder JTMB( + ES->getExecutorProcessControl().getTargetTriple()); + + auto DL = JTMB.getDefaultDataLayoutForTarget(); + if (!DL) + return DL.takeError(); + + return std::make_unique(std::move(ES), + std::move(JTMB), std::move(*DL)); + } + + const DataLayout &getDataLayout() const { return DL; } + + JITDylib &getMainJITDylib() { return MainJD; } + + void loadLibrary(const char *FileName) { + MainJD.addGenerator(cantFail( + DynamicLibrarySearchGenerator::Load(FileName, DL.getGlobalPrefix()))); + } + + Error addModule(ThreadSafeModule TSM, ResourceTrackerSP RT = nullptr) { + if (!RT) + RT = MainJD.getDefaultResourceTracker(); + return InitHelperTransformLayer.add(RT, std::move(TSM)); + } + + Error initialize() { + if (InitFunctions.empty()) + // Nothing to do if there are no initializers. + return Error::success(); + + // Lookup the symbols for the initializer functions. + DenseMap LookupSymbols; + LookupSymbols[&MainJD] = std::move(InitFunctions); + auto LookupResult = Platform::lookupInitSymbols(*ES, LookupSymbols); + if (!LookupResult) + return LookupResult.takeError(); + + // Collect the addresses of those symbols. + std::vector Initializers; + auto InitsItr = LookupResult->find(&MainJD); + for (auto &KV : InitsItr->second) + Initializers.push_back(KV.second.getAddress()); + + // Run all initializer functions. 
+ for (auto InitFnAddr : Initializers) { + auto *InitFn = InitFnAddr.toPtr(); + InitFn(); + } + return Error::success(); + } + + Expected lookup(StringRef Name) { + return ES->lookup({&MainJD}, Mangle(Name.str())); + } +}; + +} // end namespace orc +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H diff --git a/llvm/examples/Kaleidoscope/Tapir/toy.cpp b/llvm/examples/Kaleidoscope/Tapir/toy.cpp new file mode 100644 index 000000000000000..1aa1e1bb24bd75e --- /dev/null +++ b/llvm/examples/Kaleidoscope/Tapir/toy.cpp @@ -0,0 +1,2163 @@ +#include "KaleidoscopeJIT.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AliasAnalysisEvaluator.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TapirRaceDetect.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IRPrinter/IRPrintingPasses.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/Timer.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Instrumentation/CilkSanitizer.h" +#include "llvm/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/IndVarSimplify.h" +#include "llvm/Transforms/Scalar/LICM.h" +#include "llvm/Transforms/Scalar/LoopInstSimplify.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Scalar/LoopRotation.h" +#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" +#include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/Transforms/Scalar/SROA.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Tapir.h" +#include "llvm/Transforms/Tapir/LoopSpawningTI.h" +#include "llvm/Transforms/Tapir/TapirToTarget.h" +#include "llvm/Transforms/Utils.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; +using namespace llvm::orc; + +//===----------------------------------------------------------------------===// +// Lexer +//===----------------------------------------------------------------------===// + +// The lexer returns tokens [0-255] if it is an unknown character, otherwise one +// of these for known things. 
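+// For example (the input string is invented for illustration), "def fib(x)"
+// lexes to tok_def, tok_identifier("fib"), '(', tok_identifier("x"), ')';
+// "42" yields tok_integer and "4.2" yields tok_number.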
+enum Token { + tok_eof = -1, + + // commands + tok_def = -2, + tok_extern = -3, + + // primary + tok_identifier = -4, + tok_number = -5, + tok_integer = -6, + + // control + tok_if = -7, + tok_then = -8, + tok_else = -9, + tok_for = -10, + tok_in = -11, + + // operators + tok_binary = -12, + tok_unary = -13, + + // var definition + tok_var = -14, + + // parallel control + tok_spawn = -15, + tok_sync = -16, + tok_parfor = -17 +}; + +static std::string IdentifierStr; // Filled in if tok_identifier +static int64_t IntVal; // Filled in if tok_integer +static double NumVal; // Filled in if tok_number + +/// gettok - Return the next token from standard input. +static int gettok() { + static int LastChar = ' '; + + // Skip any whitespace. + while (isspace(LastChar)) + LastChar = getchar(); + + if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]* + IdentifierStr = LastChar; + while (isalnum((LastChar = getchar()))) + IdentifierStr += LastChar; + + if (IdentifierStr == "def") + return tok_def; + if (IdentifierStr == "extern") + return tok_extern; + if (IdentifierStr == "if") + return tok_if; + if (IdentifierStr == "then") + return tok_then; + if (IdentifierStr == "else") + return tok_else; + if (IdentifierStr == "for") + return tok_for; + if (IdentifierStr == "in") + return tok_in; + if (IdentifierStr == "binary") + return tok_binary; + if (IdentifierStr == "unary") + return tok_unary; + if (IdentifierStr == "var") + return tok_var; + if (IdentifierStr == "spawn") + return tok_spawn; + if (IdentifierStr == "sync") + return tok_sync; + if (IdentifierStr == "parfor") + return tok_parfor; + return tok_identifier; + } + + { + std::string NumStr; + if (isdigit(LastChar)) { // Integer: [0-9]+ + do { + NumStr += LastChar; + LastChar = getchar(); + } while (isdigit(LastChar)); + if (LastChar != '.') { + IntVal = strtol(NumStr.c_str(), nullptr, 10); + return tok_integer; + } + } + if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+ + // std::string NumStr; + do { + NumStr += LastChar; + LastChar = getchar(); + } while (isdigit(LastChar) || LastChar == '.'); + + NumVal = strtod(NumStr.c_str(), nullptr); + return tok_number; + } + } + + if (LastChar == '#') { + // Comment until end of line. + do + LastChar = getchar(); + while (LastChar != EOF && LastChar != '\n' && LastChar != '\r'); + + if (LastChar != EOF) + return gettok(); + } + + // Check for end of file. Don't eat the EOF. + if (LastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + int ThisChar = LastChar; + LastChar = getchar(); + return ThisChar; +} + +//===----------------------------------------------------------------------===// +// Abstract Syntax Tree (aka Parse Tree) +//===----------------------------------------------------------------------===// + +namespace { + +/// ExprAST - Base class for all expression nodes. +class ExprAST { +public: + virtual ~ExprAST() = default; + + virtual Value *codegen() = 0; + virtual void setIntegerRes(bool v = true) {} +}; + +/// IntegerExprAST - Expression class for integer literals like "1". +class IntegerExprAST : public ExprAST { + int64_t Val; + +public: + IntegerExprAST(int64_t Val) : Val(Val) {} + + Value *codegen() override; +}; + +/// NumberExprAST - Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double Val; + +public: + NumberExprAST(double Val) : Val(Val) {} + + Value *codegen() override; +}; + +/// VariableExprAST - Expression class for referencing a variable, like "a". 
+class VariableExprAST : public ExprAST { + std::string Name; + +public: + VariableExprAST(const std::string &Name) : Name(Name) {} + + Value *codegen() override; + const std::string &getName() const { return Name; } +}; + +/// UnaryExprAST - Expression class for a unary operator. +class UnaryExprAST : public ExprAST { + char Opcode; + std::unique_ptr Operand; + +public: + UnaryExprAST(char Opcode, std::unique_ptr Operand) + : Opcode(Opcode), Operand(std::move(Operand)) {} + + Value *codegen() override; +}; + +/// BinaryExprAST - Expression class for a binary operator. +class BinaryExprAST : public ExprAST { + char Op; + bool IntegerRes = false; + std::unique_ptr LHS, RHS; + +public: + BinaryExprAST(char Op, std::unique_ptr LHS, + std::unique_ptr RHS) + : Op(Op), LHS(std::move(LHS)), RHS(std::move(RHS)) {} + + Value *codegen() override; + void setIntegerRes(bool v = true) override { IntegerRes = v; } +}; + +/// CallExprAST - Expression class for function calls. +class CallExprAST : public ExprAST { + std::string Callee; + std::vector> Args; + +public: + CallExprAST(const std::string &Callee, + std::vector> Args) + : Callee(Callee), Args(std::move(Args)) {} + + Value *codegen() override; +}; + +/// IfExprAST - Expression class for if/then/else. +class IfExprAST : public ExprAST { + std::unique_ptr Cond, Then, Else; + +public: + IfExprAST(std::unique_ptr Cond, std::unique_ptr Then, + std::unique_ptr Else) + : Cond(std::move(Cond)), Then(std::move(Then)), Else(std::move(Else)) {} + + Value *codegen() override; +}; + +/// ForExprAST - Expression class for for/in. +class ForExprAST : public ExprAST { + std::string VarName; + std::unique_ptr Start, End, Step, Body; + +public: + ForExprAST(const std::string &VarName, std::unique_ptr Start, + std::unique_ptr End, std::unique_ptr Step, + std::unique_ptr Body) + : VarName(VarName), Start(std::move(Start)), End(std::move(End)), + Step(std::move(Step)), Body(std::move(Body)) {} + + Value *codegen() override; +}; + +/// VarExprAST - Expression class for var/in +class VarExprAST : public ExprAST { + std::vector>> VarNames; + std::unique_ptr Body; + +public: + VarExprAST( + std::vector>> VarNames, + std::unique_ptr Body) + : VarNames(std::move(VarNames)), Body(std::move(Body)) {} + + Value *codegen() override; +}; + +/// SpawnExprAST - Expression class for spawn. +class SpawnExprAST : public ExprAST { + std::unique_ptr Spawned; + +public: + SpawnExprAST(std::unique_ptr Spawned) + : Spawned(std::move(Spawned)) {} + + Value *codegen() override; +}; + +/// SyncExprAST - Expression class for spawn. +class SyncExprAST : public ExprAST { +public: + SyncExprAST() {} + + Value *codegen() override; +}; + +/// ParForExprAST - Expression class for parfor/in. +class ParForExprAST : public ExprAST { + std::string VarName; + std::unique_ptr Start, End, Step, Body; + +public: + ParForExprAST(const std::string &VarName, std::unique_ptr Start, + std::unique_ptr End, std::unique_ptr Step, + std::unique_ptr Body) + : VarName(VarName), Start(std::move(Start)), End(std::move(End)), + Step(std::move(Step)), Body(std::move(Body)) {} + + Value *codegen() override; +}; + +/// PrototypeAST - This class represents the "prototype" for a function, +/// which captures its name, and its argument names (thus implicitly the number +/// of arguments the function takes), as well as if it is an operator. +class PrototypeAST { + std::string Name; + std::vector Args; + bool IsOperator; + unsigned Precedence; // Precedence if a binary op. 
+ +public: + PrototypeAST(const std::string &Name, std::vector Args, + bool IsOperator = false, unsigned Prec = 0) + : Name(Name), Args(std::move(Args)), IsOperator(IsOperator), + Precedence(Prec) {} + + Function *codegen(); + const std::string &getName() const { return Name; } + + bool isUnaryOp() const { return IsOperator && Args.size() == 1; } + bool isBinaryOp() const { return IsOperator && Args.size() == 2; } + + char getOperatorName() const { + assert(isUnaryOp() || isBinaryOp()); + return Name[Name.size() - 1]; + } + + unsigned getBinaryPrecedence() const { return Precedence; } +}; + +/// FunctionAST - This class represents a function definition itself. +class FunctionAST { + std::unique_ptr Proto; + std::unique_ptr Body; + +public: + FunctionAST(std::unique_ptr Proto, + std::unique_ptr Body) + : Proto(std::move(Proto)), Body(std::move(Body)) {} + + Function *codegen(); +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// Parser +//===----------------------------------------------------------------------===// + +/// CurTok/getNextToken - Provide a simple token buffer. CurTok is the current +/// token the parser is looking at. getNextToken reads another token from the +/// lexer and updates CurTok with its results. +static int CurTok; +static int getNextToken() { return CurTok = gettok(); } + +/// BinopPrecedence - This holds the precedence for each binary operator that is +/// defined. +static std::map BinopPrecedence; + +/// GetTokPrecedence - Get the precedence of the pending binary operator token. +static int GetTokPrecedence() { + if (!isascii(CurTok)) + return -1; + + // Make sure it's a declared binop. + int TokPrec = BinopPrecedence[CurTok]; + if (TokPrec <= 0) + return -1; + return TokPrec; +} + +/// LogError* - These are little helper functions for error handling. +std::unique_ptr LogError(const char *Str) { + fprintf(stderr, "Error: %s\n", Str); + return nullptr; +} + +std::unique_ptr LogErrorP(const char *Str) { + LogError(Str); + return nullptr; +} + +static std::unique_ptr ParseExpression(); + +/// integerexpr ::= integer +static std::unique_ptr ParseIntegerExpr() { + auto Result = std::make_unique(IntVal); + getNextToken(); // consume the number + return std::move(Result); +} + +/// numberexpr ::= number +static std::unique_ptr ParseNumberExpr() { + auto Result = std::make_unique(NumVal); + getNextToken(); // consume the number + return std::move(Result); +} + +/// parenexpr ::= '(' expression ')' +static std::unique_ptr ParseParenExpr() { + getNextToken(); // eat (. + auto V = ParseExpression(); + if (!V) + return nullptr; + + if (CurTok != ')') + return LogError("expected ')'"); + getNextToken(); // eat ). + return V; +} + +/// identifierexpr +/// ::= identifier +/// ::= identifier '(' expression* ')' +static std::unique_ptr ParseIdentifierExpr() { + std::string IdName = IdentifierStr; + + getNextToken(); // eat identifier. + + if (CurTok != '(') // Simple variable ref. + return std::make_unique(IdName); + + // Call. + getNextToken(); // eat ( + std::vector> Args; + if (CurTok != ')') { + while (true) { + if (auto Arg = ParseExpression()) + Args.push_back(std::move(Arg)); + else + return nullptr; + + if (CurTok == ')') + break; + + if (CurTok != ',') + return LogError("Expected ')' or ',' in argument list"); + getNextToken(); + } + } + + // Eat the ')'. 
+ getNextToken(); + + return std::make_unique(IdName, std::move(Args)); +} + +/// ifexpr ::= 'if' expression 'then' expression 'else' expression +static std::unique_ptr ParseIfExpr() { + getNextToken(); // eat the if. + + // condition. + auto Cond = ParseExpression(); + if (!Cond) + return nullptr; + + if (CurTok != tok_then) + return LogError("expected then"); + getNextToken(); // eat the then + + auto Then = ParseExpression(); + if (!Then) + return nullptr; + + if (CurTok != tok_else) + return LogError("expected else"); + + getNextToken(); + + auto Else = ParseExpression(); + if (!Else) + return nullptr; + + return std::make_unique(std::move(Cond), std::move(Then), + std::move(Else)); +} + +/// forexpr ::= 'for' identifier '=' expr ',' expr (',' expr)? 'in' expression +static std::unique_ptr ParseForExpr() { + getNextToken(); // eat the for. + + if (CurTok != tok_identifier) + return LogError("expected identifier after for"); + + std::string IdName = IdentifierStr; + getNextToken(); // eat identifier. + + if (CurTok != '=') + return LogError("expected '=' after for"); + getNextToken(); // eat '='. + + auto Start = ParseExpression(); + if (!Start) + return nullptr; + if (CurTok != ',') + return LogError("expected ',' after for start value"); + getNextToken(); + + auto End = ParseExpression(); + if (!End) + return nullptr; + + // The step value is optional. + std::unique_ptr Step; + if (CurTok == ',') { + getNextToken(); + Step = ParseExpression(); + if (!Step) + return nullptr; + } + + if (CurTok != tok_in) + return LogError("expected 'in' after for"); + getNextToken(); // eat 'in'. + + auto Body = ParseExpression(); + if (!Body) + return nullptr; + + return std::make_unique(IdName, std::move(Start), std::move(End), + std::move(Step), std::move(Body)); +} + +/// varexpr ::= 'var' identifier ('=' expression)? +// (',' identifier ('=' expression)?)* 'in' expression +static std::unique_ptr ParseVarExpr() { + getNextToken(); // eat the var. + + std::vector>> VarNames; + + // At least one variable name is required. + if (CurTok != tok_identifier) + return LogError("expected identifier after var"); + + while (true) { + std::string Name = IdentifierStr; + getNextToken(); // eat identifier. + + // Read the optional initializer. + std::unique_ptr Init = nullptr; + if (CurTok == '=') { + getNextToken(); // eat the '='. + + Init = ParseExpression(); + if (!Init) + return nullptr; + } + + VarNames.push_back(std::make_pair(Name, std::move(Init))); + + // End of var list, exit loop. + if (CurTok != ',') + break; + getNextToken(); // eat the ','. + + if (CurTok != tok_identifier) + return LogError("expected identifier list after var"); + } + + // At this point, we have to have 'in'. + if (CurTok != tok_in) + return LogError("expected 'in' keyword after 'var'"); + getNextToken(); // eat 'in'. + + auto Body = ParseExpression(); + if (!Body) + return nullptr; + + return std::make_unique(std::move(VarNames), std::move(Body)); +} + +/// spawnexpr ::= 'spawn' expression +static std::unique_ptr ParseSpawnExpr() { + getNextToken(); // eat the spawn. + auto Spawned = ParseExpression(); + if (!Spawned) + return nullptr; + return std::make_unique(std::move(Spawned)); +} + +/// syncexpr ::= 'sync' +static std::unique_ptr ParseSyncExpr() { + getNextToken(); // eat the sync. + return std::make_unique(); +} + +/// parforexpr ::= 'parfor' identifier '=' expr ',' expr (',' expr)? 'in' expression +static std::unique_ptr ParseParForExpr() { + getNextToken(); // eat the parfor. 
+ + if (CurTok != tok_identifier) + return LogError("expected identifier after parfor"); + + std::string IdName = IdentifierStr; + getNextToken(); // eat identifier. + + if (CurTok != '=') + return LogError("expected '=' after for"); + getNextToken(); // eat '='. + + auto Start = ParseExpression(); + if (!Start) + return nullptr; + if (CurTok != ',') + return LogError("expected ',' after for start value"); + getNextToken(); + + auto End = ParseExpression(); + if (!End) + return nullptr; + + // The step value is optional. + std::unique_ptr Step; + if (CurTok == ',') { + getNextToken(); + Step = ParseExpression(); + if (!Step) + return nullptr; + } + + if (CurTok != tok_in) + return LogError("expected 'in' after for"); + getNextToken(); // eat 'in'. + + auto Body = ParseExpression(); + if (!Body) + return nullptr; + + return std::make_unique(IdName, std::move(Start), + std::move(End), std::move(Step), + std::move(Body)); +} + +/// primary +/// ::= identifierexpr +/// ::= integerexpr +/// ::= numberexpr +/// ::= parenexpr +/// ::= ifexpr +/// ::= forexpr +/// ::= varexpr +/// ::= spawnexpr +/// ::= syncexpr +/// ::= parforexpr +static std::unique_ptr ParsePrimary(bool Integer = false) { + switch (CurTok) { + default: + return LogError("unknown token when expecting an expression"); + case tok_identifier: + return ParseIdentifierExpr(); + case tok_integer: + return ParseIntegerExpr(); + case tok_number: + return ParseNumberExpr(); + case '(': + return ParseParenExpr(); + case tok_if: + return ParseIfExpr(); + case tok_for: + return ParseForExpr(); + case tok_var: + return ParseVarExpr(); + case tok_spawn: + return ParseSpawnExpr(); + case tok_sync: + return ParseSyncExpr(); + case tok_parfor: + return ParseParForExpr(); + } +} + +/// unary +/// ::= primary +/// ::= '!' unary +static std::unique_ptr ParseUnary() { + // If the current token is not an operator, it must be a primary expr. + if (!isascii(CurTok) || CurTok == '(' || CurTok == ',') + return ParsePrimary(); + + // If this is a unary operator, read it. + int Opc = CurTok; + getNextToken(); + if (auto Operand = ParseUnary()) + return std::make_unique(Opc, std::move(Operand)); + return nullptr; +} + +/// binoprhs +/// ::= ('+' unary)* +static std::unique_ptr ParseBinOpRHS(int ExprPrec, + std::unique_ptr LHS, + bool Integer = false) { + // If this is a binop, find its precedence. + while (true) { + int TokPrec = GetTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (TokPrec < ExprPrec) + return LHS; + + // Okay, we know this is a binop. + int BinOp = CurTok; + getNextToken(); // eat binop + + // Parse the unary expression after the binary operator. + auto RHS = ParseUnary(); + if (!RHS) + return nullptr; + + // If BinOp binds less tightly with RHS than the operator after RHS, let + // the pending operator take RHS as its LHS. + int NextPrec = GetTokPrecedence(); + if (TokPrec < NextPrec) { + RHS = ParseBinOpRHS(TokPrec + 1, std::move(RHS)); + if (!RHS) + return nullptr; + } + + // Merge LHS/RHS. + LHS = + std::make_unique(BinOp, std::move(LHS), std::move(RHS)); + } +} + +/// expression +/// ::= unary binoprhs +/// +static std::unique_ptr ParseExpression() { + auto LHS = ParseUnary(); + if (!LHS) + return nullptr; + + return ParseBinOpRHS(0, std::move(LHS)); +} + +/// prototype +/// ::= id '(' id* ')' +/// ::= binary LETTER number? 
(id, id) +/// ::= unary LETTER (id) +static std::unique_ptr ParsePrototype() { + std::string FnName; + + unsigned Kind = 0; // 0 = identifier, 1 = unary, 2 = binary. + unsigned BinaryPrecedence = 30; + + switch (CurTok) { + default: + return LogErrorP("Expected function name in prototype"); + case tok_identifier: + FnName = IdentifierStr; + Kind = 0; + getNextToken(); + break; + case tok_unary: + getNextToken(); + if (!isascii(CurTok)) + return LogErrorP("Expected unary operator"); + FnName = "unary"; + FnName += (char)CurTok; + Kind = 1; + getNextToken(); + break; + case tok_binary: + getNextToken(); + if (!isascii(CurTok)) + return LogErrorP("Expected binary operator"); + FnName = "binary"; + FnName += (char)CurTok; + Kind = 2; + getNextToken(); + + // Read the precedence if present. + if (CurTok == tok_integer) { + if (IntVal < 1 || IntVal > 100) + return LogErrorP("Invalid precedence: must be 1..100"); + BinaryPrecedence = (unsigned)IntVal; + getNextToken(); + } + break; + } + + if (CurTok != '(') + return LogErrorP("Expected '(' in prototype"); + + std::vector ArgNames; + while (getNextToken() == tok_identifier) + ArgNames.push_back(IdentifierStr); + if (CurTok != ')') + return LogErrorP("Expected ')' in prototype"); + + // success. + getNextToken(); // eat ')'. + + // Verify right number of names for operator. + if (Kind && ArgNames.size() != Kind) + return LogErrorP("Invalid number of operands for operator"); + + return std::make_unique(FnName, ArgNames, Kind != 0, + BinaryPrecedence); +} + +/// definition ::= 'def' prototype expression +static std::unique_ptr ParseDefinition() { + getNextToken(); // eat def. + auto Proto = ParsePrototype(); + if (!Proto) + return nullptr; + + if (auto E = ParseExpression()) + return std::make_unique(std::move(Proto), std::move(E)); + return nullptr; +} + +/// toplevelexpr ::= expression +static std::unique_ptr ParseTopLevelExpr() { + if (auto E = ParseExpression()) { + // Make an anonymous proto. + auto Proto = std::make_unique("__anon_expr", + std::vector()); + return std::make_unique(std::move(Proto), std::move(E)); + } + return nullptr; +} + +/// external ::= 'extern' prototype +static std::unique_ptr ParseExtern() { + getNextToken(); // eat extern. + return ParsePrototype(); +} + +//===----------------------------------------------------------------------===// +// Code Generation +//===----------------------------------------------------------------------===// + +static std::unique_ptr TheContext; +static std::unique_ptr TheModule; +static std::unique_ptr> Builder; +static std::map NamedValues; +static std::unique_ptr TheJIT; +static std::map> FunctionProtos; +static ExitOnError ExitOnErr; + +// Variables for codegen for the current task scope. + +// TaskScopeEntry keeps track of the entry basic block of the function +// or nested task being emitted. +static BasicBlock *TaskScopeEntry = nullptr; + +// TaskScopeSyncRegion keeps track of a call to +// @llvm.syncregion.start() in TaskScopeEntry, if one exists. +static Value *TaskScopeSyncRegion = nullptr; + +// Flags controlled from the command line. 
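+// For example, assuming the example is built as a binary named "toy", an
+// invocation such as
+//   ./toy -O2 --lower-tapir-to cilk --print-ir after-tapir-lowering
+// optimizes, lowers Tapir to the OpenCilk runtime, and dumps the IR after
+// lowering, whereas
+//   ./toy -O0 --lower-tapir-to none --run-cilksan
+// disables optimization, leaves Tapir constructs unlowered, and adds Cilksan
+// race-detection instrumentation.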
+static bool Optimize = true; +static bool RunCilksan = false; +enum PrintIRLevel { + PrintIR_None = 0, + PrintIR_BeforeOpt = 0x1, + PrintIR_BeforeTapirLowering = 0x2, + PrintIR_AfterTapirLoopSpawning = 0x4, + PrintIR_AfterTapirLowering = 0x8, +}; +static bool PrintIRBeforeOpt(PrintIRLevel Level) { + return (static_cast(Level) & static_cast(PrintIR_BeforeOpt)) == + static_cast(PrintIR_BeforeOpt); +} +static bool PrintIRBeforeTapirLowering(PrintIRLevel Level) { + return (static_cast(Level) & + static_cast(PrintIR_BeforeTapirLowering)) == + static_cast(PrintIR_BeforeTapirLowering); +} +static bool PrintIRAfterTapirLoopSpawning(PrintIRLevel Level) { + return (static_cast(Level) & + static_cast(PrintIR_AfterTapirLoopSpawning)) == + static_cast(PrintIR_AfterTapirLoopSpawning); +} +static bool PrintIRAfterTapirLowering(PrintIRLevel Level) { + return (static_cast(Level) & + static_cast(PrintIR_AfterTapirLowering)) == + static_cast(PrintIR_AfterTapirLowering); +} +static PrintIRLevel setPrintIRBeforeOpt(PrintIRLevel Level) { + return static_cast(static_cast(Level) | + static_cast(PrintIR_BeforeOpt)); +} +static PrintIRLevel setPrintIRBeforeTapirLowering(PrintIRLevel Level) { + return static_cast( + static_cast(Level) | static_cast(PrintIR_BeforeTapirLowering)); +} +static PrintIRLevel setPrintIRAfterTapirLoopSpawning(PrintIRLevel Level) { + return static_cast( + static_cast(Level) | + static_cast(PrintIR_AfterTapirLoopSpawning)); +} +static PrintIRLevel setPrintIRAfterTapirLowering(PrintIRLevel Level) { + return static_cast( + static_cast(Level) | static_cast(PrintIR_AfterTapirLowering)); +} +static PrintIRLevel PrintIRLvl = PrintIR_None; +// Options related to Tapir lowering. +static TapirTargetID TheTapirTarget; +static std::string OpenCilkRuntimeBCPath; + +Value *LogErrorV(const char *Str) { + LogError(Str); + return nullptr; +} + +Function *getFunction(std::string Name) { + // First, see if the function has already been added to the current module. + if (auto *F = TheModule->getFunction(Name)) + return F; + + // If not, check whether we can codegen the declaration from some existing + // prototype. + auto FI = FunctionProtos.find(Name); + if (FI != FunctionProtos.end()) + return FI->second->codegen(); + + // If no existing prototype exists, return null. + return nullptr; +} + +/// CreateEntryBlockAlloca - Create an alloca instruction in the entry block of +/// the function. This is used for mutable variables etc. +static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction, + StringRef VarName) { + IRBuilder<> TmpB(&TheFunction->getEntryBlock(), + TheFunction->getEntryBlock().begin()); + return TmpB.CreateAlloca(Type::getDoubleTy(*TheContext), nullptr, VarName); +} + +/// CreateTaskEntryBlockAlloca - Create an alloca instruction in the entry block +/// of the current task. This is used for mutable variables etc. +/// +/// Requires the CFG of the function to be constructed up to BB. 
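+///
+/// For example, ParForExprAST::codegen() below uses this helper so that each
+/// detached loop iteration gets its own copy of the induction variable: the
+/// alloca is placed in the entry block of the spawned task rather than in the
+/// entry block of the enclosing function, so parallel iterations do not race
+/// on a shared stack slot.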
+static AllocaInst *CreateTaskEntryBlockAlloca(StringRef VarName, + Type *AllocaTy = + Type::getDoubleTy(*TheContext)) { + BasicBlock *TaskEntry = TaskScopeEntry; + if (!TaskEntry) { + LogError("No local task scope."); + return nullptr; + } + IRBuilder<> TmpB(TaskEntry, TaskEntry->begin()); + return TmpB.CreateAlloca(AllocaTy, nullptr, VarName); +} + +Value *IntegerExprAST::codegen() { + return ConstantInt::get(*TheContext, APSInt::get(Val)); +} + +Value *NumberExprAST::codegen() { + return ConstantFP::get(*TheContext, APFloat(Val)); +} + +Value *VariableExprAST::codegen() { + // Look this variable up in the function. + Value *V = NamedValues[Name]; + if (!V) + return LogErrorV("Unknown variable name"); + + if (!isa(V)) + return V; + + AllocaInst *A = cast(V); + // Load the value. + return Builder->CreateLoad(A->getAllocatedType(), A, Name.c_str()); +} + +Value *UnaryExprAST::codegen() { + Value *OperandV = Operand->codegen(); + if (!OperandV) + return nullptr; + + Function *F = getFunction(std::string("unary") + Opcode); + if (!F) + return LogErrorV("Unknown unary operator"); + + return Builder->CreateCall(F, OperandV, "unop"); +} + +Value *BinaryExprAST::codegen() { + // Special case '=' because we don't want to emit the LHS as an expression. + if (Op == '=') { + // Assignment requires the LHS to be an identifier. + // This assume we're building without RTTI because LLVM builds that way by + // default. If you build LLVM with RTTI this can be changed to a + // dynamic_cast for automatic error checking. + VariableExprAST *LHSE = static_cast(LHS.get()); + if (!LHSE) + return LogErrorV("destination of '=' must be a variable"); + // Codegen the RHS. + Value *Val = RHS->codegen(); + if (!Val) + return nullptr; + + // Look up the name. + Value *Variable = NamedValues[LHSE->getName()]; + if (!Variable) + return LogErrorV("Unknown variable name"); + + Builder->CreateStore(Val, Variable); + return Val; + } + + Value *L = LHS->codegen(); + Value *R = RHS->codegen(); + if (!L || !R) + return nullptr; + Type *LTy = L->getType(); + Type *RTy = R->getType(); + bool IntegerOp = IntegerRes || + (LTy->isIntegerTy() && RTy->isIntegerTy()); + // Cast the operand types if necessary + if (!IntegerOp) { + if (LTy->isIntegerTy()) + L = Builder->CreateSIToFP(L, Type::getDoubleTy(*TheContext)); + if (RTy->isIntegerTy()) + R = Builder->CreateSIToFP(R, Type::getDoubleTy(*TheContext)); + } else if (IntegerRes) { + if (!LTy->isIntegerTy()) + L = Builder->CreateFPToSI(L, Type::getInt64Ty(*TheContext)); + if (!RTy->isIntegerTy()) + R = Builder->CreateFPToSI(R, Type::getInt64Ty(*TheContext)); + } + // Create the appropriate operation + switch (Op) { + case '+': + if (IntegerOp) + return Builder->CreateAdd(L, R, "addtmp"); + return Builder->CreateFAdd(L, R, "addtmp"); + case '-': + if (IntegerOp) + return Builder->CreateSub(L, R, "subtmp"); + return Builder->CreateFSub(L, R, "subtmp"); + case '*': + if (IntegerOp) + return Builder->CreateMul(L, R, "multmp"); + return Builder->CreateFMul(L, R, "multmp"); + case '<': + if (IntegerOp) { + L = Builder->CreateICmpSLT(L, R, "cmptmp"); + return Builder->CreateZExt(L, Type::getInt64Ty(*TheContext), "booltmp"); + } + L = Builder->CreateFCmpULT(L, R, "cmptmp"); + // Convert bool 0/1 to double 0.0 or 1.0 + return Builder->CreateUIToFP(L, Type::getDoubleTy(*TheContext), "booltmp"); + default: + break; + } + + // If it wasn't a builtin binary operator, it must be a user defined one. Emit + // a call to it. 
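+  // For example, given the (illustrative) operator definition
+  //   def binary> 10 (LHS RHS) RHS < LHS;
+  // an expression like "a > b" reaches this point with Op == '>' and is
+  // emitted as a call to the JIT'd function named "binary>".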
+ Function *F = getFunction(std::string("binary") + Op); + assert(F && "binary operator not found!"); + + Value *Ops[] = {L, R}; + return Builder->CreateCall(F, Ops, "binop"); +} + +Value *CallExprAST::codegen() { + // Look up the name in the global module table. + Function *CalleeF = getFunction(Callee); + if (!CalleeF) + return LogErrorV("Unknown function referenced"); + + // If argument mismatch error. + if (CalleeF->arg_size() != Args.size()) + return LogErrorV("Incorrect # arguments passed"); + + std::vector ArgsV; + for (unsigned i = 0, e = Args.size(); i != e; ++i) { + Value *ArgVal = Args[i]->codegen(); + if (ArgVal->getType()->isIntegerTy()) + ArgVal = Builder->CreateSIToFP(ArgVal, Type::getDoubleTy(*TheContext)); + ArgsV.push_back(ArgVal); + if (!ArgsV.back()) + return nullptr; + } + + return Builder->CreateCall(CalleeF, ArgsV, "calltmp"); +} + +Value *IfExprAST::codegen() { + Value *CondV = Cond->codegen(); + if (!CondV) + return nullptr; + + // Convert condition to a bool by comparing non-equal to 0.0. + CondV = Builder->CreateFCmpONE( + CondV, ConstantFP::get(*TheContext, APFloat(0.0)), "ifcond"); + + Function *TheFunction = Builder->GetInsertBlock()->getParent(); + + // Create blocks for the then and else cases. Insert the 'then' block at the + // end of the function. + BasicBlock *ThenBB = BasicBlock::Create(*TheContext, "then", TheFunction); + BasicBlock *ElseBB = BasicBlock::Create(*TheContext, "else"); + BasicBlock *MergeBB = BasicBlock::Create(*TheContext, "ifcont"); + + Builder->CreateCondBr(CondV, ThenBB, ElseBB); + + // Emit then value. + Builder->SetInsertPoint(ThenBB); + + Value *ThenV = Then->codegen(); + if (!ThenV) + return nullptr; + + Builder->CreateBr(MergeBB); + // Codegen of 'Then' can change the current block, update ThenBB for the PHI. + ThenBB = Builder->GetInsertBlock(); + + // Emit else block. + TheFunction->insert(TheFunction->end(), ElseBB); + Builder->SetInsertPoint(ElseBB); + + Value *ElseV = Else->codegen(); + if (!ElseV) + return nullptr; + + Builder->CreateBr(MergeBB); + // Codegen of 'Else' can change the current block, update ElseBB for the PHI. + ElseBB = Builder->GetInsertBlock(); + + // Emit merge block. + TheFunction->insert(TheFunction->end(), MergeBB); + Builder->SetInsertPoint(MergeBB); + bool IntegerType = (ThenV->getType()->isIntegerTy() && + ElseV->getType()->isIntegerTy()); + Type *PNTy = IntegerType ? Type::getInt64Ty(*TheContext) : + Type::getDoubleTy(*TheContext); + PHINode *PN = Builder->CreatePHI(PNTy, 2, "iftmp"); + if (!IntegerType) { + if (ThenV->getType()->isIntegerTy()) + ThenV = Builder->CreateSIToFP(ThenV, Type::getDoubleTy(*TheContext)); + if (ElseV->getType()->isIntegerTy()) + ElseV = Builder->CreateSIToFP(ElseV, Type::getDoubleTy(*TheContext)); + } + PN->addIncoming(ThenV, ThenBB); + PN->addIncoming(ElseV, ElseBB); + return PN; +} + +// Output for-loop as: +// var = alloca double +// ... +// start = startexpr +// store start -> var +// br cond +// cond: +// endcond = endexpr +// br endcond, loop, afterloop +// loop: +// ... +// bodyexpr +// ... +// loopend: +// step = stepexpr +// curvar = load var +// nextvar = curvar + step +// store nextvar -> var +// br cond +// afterloop: +Value *ForExprAST::codegen() { + Function *TheFunction = Builder->GetInsertBlock()->getParent(); + + // Create an alloca for the variable in the entry block. + AllocaInst *Alloca = CreateTaskEntryBlockAlloca(VarName); + + // Emit the start code first, without 'variable' in scope. 
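+  // For example, in the (illustrative) loop
+  //   for i = 1, i < 10, 2.0 in i*42
+  // Start is the literal 1, End is the comparison "i < 10", and Step is 2.0;
+  // the body's value is ignored, and the for expression itself always
+  // evaluates to 0.0.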
+ Value *StartVal = Start->codegen(); + if (!StartVal) + return nullptr; + if (StartVal->getType()->isIntegerTy()) + StartVal = Builder->CreateSIToFP(StartVal, Type::getDoubleTy(*TheContext)); + + // Store the value into the alloca. + Builder->CreateStore(StartVal, Alloca); + + // Make the new basic block for the loop header, inserting after current + // block. + BasicBlock *CondBB = BasicBlock::Create(*TheContext, "cond", TheFunction); + BasicBlock *LoopBB = BasicBlock::Create(*TheContext, "loop", TheFunction); + BasicBlock *AfterBB = BasicBlock::Create(*TheContext, "afterloop"); + + // Insert an explicit fall through from the current block to the CondBB. + Builder->CreateBr(CondBB); + + // Start insertion in CondBB. + Builder->SetInsertPoint(CondBB); + + // Within the loop, the variable is defined equal to the PHI node. If it + // shadows an existing variable, we have to restore it, so save it now. + Value *OldVal = NamedValues[VarName]; + NamedValues[VarName] = Alloca; + + // Compute the end condition. + Value *EndCond = End->codegen(); + if (!EndCond) + return nullptr; + + // Convert condition to a bool by comparing non-equal to 0.0. + EndCond = Builder->CreateFCmpONE( + EndCond, ConstantFP::get(*TheContext, APFloat(0.0)), "loopcond"); + + // Insert the conditional branch into the end of LoopEndBB. + Builder->CreateCondBr(EndCond, LoopBB, AfterBB); + + // Start insertion in LoopBB. + Builder->SetInsertPoint(LoopBB); + + // Emit the body of the loop. This, like any other expr, can change the + // current BB. Note that we ignore the value computed by the body, but don't + // allow an error. + if (!Body->codegen()) + return nullptr; + + // Emit the step value. + Value *StepVal = nullptr; + if (Step) { + StepVal = Step->codegen(); + if (!StepVal) + return nullptr; + } else { + // If not specified, use 1.0. + StepVal = ConstantFP::get(*TheContext, APFloat(1.0)); + } + + // Reload, increment, and restore the alloca. This handles the case where + // the body of the loop mutates the variable. + Value *CurVar = + Builder->CreateLoad(Alloca->getAllocatedType(), Alloca, VarName.c_str()); + Value *NextVar = Builder->CreateFAdd(CurVar, StepVal, "nextvar"); + Builder->CreateStore(NextVar, Alloca); + + // Insert a back edge to CondBB. + Builder->CreateBr(CondBB); + + // Emit the "after loop" block. + TheFunction->insert(TheFunction->end(), AfterBB); + + // Any new code will be inserted in AfterBB. + Builder->SetInsertPoint(AfterBB); + + // Restore the unshadowed variable. + if (OldVal) + NamedValues[VarName] = OldVal; + else + NamedValues.erase(VarName); + + // for expr always returns 0.0. + return Constant::getNullValue(Type::getDoubleTy(*TheContext)); +} + +Value *VarExprAST::codegen() { + std::vector OldBindings; + + // Register all variables and emit their initializer. + for (unsigned i = 0, e = VarNames.size(); i != e; ++i) { + const std::string &VarName = VarNames[i].first; + ExprAST *Init = VarNames[i].second.get(); + + // Emit the initializer before adding the variable to scope, this prevents + // the initializer from referencing the variable itself, and permits stuff + // like this: + // var a = 1 in + // var a = a in ... # refers to outer 'a'. + Value *InitVal; + if (Init) { + InitVal = Init->codegen(); + if (!InitVal) + return nullptr; + } else { // If not specified, use 0.0. 
+ InitVal = ConstantFP::get(*TheContext, APFloat(0.0)); + } + + AllocaInst *Alloca = CreateTaskEntryBlockAlloca(VarName, InitVal->getType()); + Builder->CreateStore(InitVal, Alloca); + + // Remember the old variable binding so that we can restore the binding when + // we unrecurse. + OldBindings.push_back(NamedValues[VarName]); + + // Remember this binding. + NamedValues[VarName] = Alloca; + } + + // Codegen the body, now that all vars are in scope. + Value *BodyVal = Body->codegen(); + if (!BodyVal) + return nullptr; + + // Pop all our variables from scope. + for (unsigned i = 0, e = VarNames.size(); i != e; ++i) + NamedValues[VarNames[i].first] = OldBindings[i]; + + // Return the body computation. + return BodyVal; +} + +// RAII class to manage the entry block and sync region in each nested task +// scope. +class TaskScopeRAII { + BasicBlock *OldTaskScopeEntry; + Value *OldSyncRegion = nullptr; +public: + explicit TaskScopeRAII(BasicBlock *NewTaskScopeEntry) : + OldTaskScopeEntry(TaskScopeEntry), OldSyncRegion(TaskScopeSyncRegion) { + TaskScopeEntry = NewTaskScopeEntry; + TaskScopeSyncRegion = nullptr; + } + ~TaskScopeRAII() { + TaskScopeEntry = OldTaskScopeEntry; + TaskScopeSyncRegion = OldSyncRegion; + } +}; + +// Helper method for creating sync regions. +static Value *CreateSyncRegion(Module &M) { + BasicBlock *TaskEntry = TaskScopeEntry; + if (!TaskEntry) + return LogErrorV("No local task scope."); + IRBuilder<> TmpB(TaskEntry, TaskEntry->begin()); + return TmpB.CreateCall( + Intrinsic::getDeclaration(&M, Intrinsic::syncregion_start), {}); +} + +// Output spawn spawned_expr as: +// sync_region = call token @llvm.syncregion.start() +// ... +// detach within sync_region, label detachbb, label continbb +// detachbb: +// ... +// spawned_expr +// ... +// reattach within sync_region, continbb +// continbb: +Value *SpawnExprAST::codegen() { + // Create a sync region for the local function or task scope, if necessary. + if (!TaskScopeSyncRegion) + TaskScopeSyncRegion = CreateSyncRegion(*TheModule); + // Get the sync region for this task scope. + Value *SyncRegion = TaskScopeSyncRegion; + Function *TheFunction = Builder->GetInsertBlock()->getParent(); + + // Create the detach and continue blocks. Insert the continue block + // at the end of the function. + BasicBlock *DetachBB = BasicBlock::Create(*TheContext, "detachbb", + TheFunction); + // We hold off inserting ContinueBB into TheFunction until after we + // emit the spawned statement, to make the final LLVM IR a bit + // cleaner. + BasicBlock *ContinueBB = BasicBlock::Create(*TheContext, "continbb"); + + // Create the detach and prepare to emit the spawned expression starting in + // the detach block. + Builder->CreateDetach(DetachBB, ContinueBB, SyncRegion); + Builder->SetInsertPoint(DetachBB); + + // Emit the spawned computation. + { + TaskScopeRAII TaskScope(DetachBB); + // Emit the spawned expr. This, like any other expr, can change the current + // BB. + if (!Spawned->codegen()) + return nullptr; + + // Emit a reattach to the continue block. + Builder->CreateReattach(ContinueBB, SyncRegion); + } + + TheFunction->insert(TheFunction->end(), ContinueBB); + Builder->SetInsertPoint(ContinueBB); + + // Return a default value of 0.0. + return Constant::getNullValue(Type::getDoubleTy(*TheContext)); +} + +Value *SyncExprAST::codegen() { + // Create a sync region for the local function or task scope, if necessary. + if (!TaskScopeSyncRegion) + TaskScopeSyncRegion = CreateSyncRegion(*TheModule); + // Get the sync region for this task scope. 
+ Value *SyncRegion = TaskScopeSyncRegion; + Function *TheFunction = Builder->GetInsertBlock()->getParent(); + + // Create a continuation block for the sync. + BasicBlock *SyncContinueBB = BasicBlock::Create(*TheContext, "sync.continue", + TheFunction); + + // Create the sync, and set the insert point to the continue block. + Builder->CreateSync(SyncContinueBB, SyncRegion); + Builder->SetInsertPoint(SyncContinueBB); + + // Return a default value of 0.0. + return Constant::getNullValue(Type::getDoubleTy(*TheContext)); +} + +static std::vector GetTapirLoopMetadata() { + std::string TapirLoopSpawningStrategy = "tapir.loop.spawn.strategy"; + const int32_t DACLoopSpawning = 1; + std::vector Result; + + // Add the DAC loop-spawning strategy for Tapir loops. + Result.push_back(MDNode::get(*TheContext, + { MDString::get(*TheContext, + TapirLoopSpawningStrategy), + ConstantAsMetadata::get( + Builder->getInt32(DACLoopSpawning)) })); + + return Result; +} + +// Output parfor-loop as: +// sr = call token @llvm.syncregion.start +// ... +// start = startexpr +// br pcond +// pcond: +// variable = phi [start, loopheader], [nextvar, loopend] +// endcond = endexpr +// br endcond, ploop, afterloop +// ploop: +// detach within sr, ploop.bodyentry, ploop.continue +// ploop.bodyentry: +// var = alloca double +// store variable -> var +// ... +// bodyexpr +// ... +// reattach within sr, ploop.continue +// ploop.continue: +// step = stepexpr +// nextvar = variable + step +// br cond +// afterloop: +// sync within sr, aftersync +// aftersync: +Value *ParForExprAST::codegen() { + Function *TheFunction = Builder->GetInsertBlock()->getParent(); + + // Emit the start code first, without 'variable' in scope. + Value *StartVal = Start->codegen(); + if (!StartVal) + return nullptr; + + // Make the new basic block for the loop header, inserting after current + // block. + BasicBlock *PreheaderBB = Builder->GetInsertBlock(); + BasicBlock *CondBB = BasicBlock::Create(*TheContext, "pcond", TheFunction); + BasicBlock *LoopBB = BasicBlock::Create(*TheContext, "ploop", TheFunction); + BasicBlock *AfterBB = BasicBlock::Create(*TheContext, "afterloop"); + + // [Tapir] Create a sync region just for the loop, so we can sync + // the loop iterations separately from other spawns in the same + // function. + Value *SyncRegion = CreateSyncRegion(*TheFunction->getParent()); + + // Insert an explicit fall through from the current block to the CondBB. + Builder->CreateBr(CondBB); + + // Start insertion in CondBB. + Builder->SetInsertPoint(CondBB); + + // Start the PHI node with an entry for Start. + // [Tapir] Note: For the LoopSpawning pass to work, we ensure that + // Variable is an integer. + PHINode *Variable = + Builder->CreatePHI(Type::getInt64Ty(*TheContext), 2, VarName); + Variable->addIncoming(StartVal, PreheaderBB); + + // Within the parallel loop, we use new different copies of the variable. + // Save any existing variables that are shadowed. + Value *OldVal = NamedValues[VarName]; + // For the end condition, use the PHI node as the variable VarName. + NamedValues[VarName] = Variable; + + // If the end is a binary expression, force it to produce an integer result. + End->setIntegerRes(); + // Compute the end condition. + Value *EndCond = End->codegen(); + if (!EndCond) + return nullptr; + // [Tapir] Note: For the LoopSpawning pass to work, we ensure that + // EndCond is an integer. 
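+  // For example, in the (illustrative) loop "parfor i = 1, i < n in work(i)",
+  // the setIntegerRes() call above makes the '<' comparison produce an i64
+  // value directly, so the cast below serves as a fallback for end
+  // expressions that do not already produce an integer (e.g., a plain
+  // double-valued variable).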
+ if (!EndCond->getType()->isIntegerTy()) + EndCond = Builder->CreateFPToSI(EndCond, Type::getInt64Ty(*TheContext)); + + // Convert condition to a bool by comparing non-equal to 0. + EndCond = Builder->CreateICmpNE( + EndCond, ConstantInt::get(*TheContext, APSInt::get(0)), "loopcond"); + + // Insert the conditional branch to either LoopBB or AfterBB. + Builder->CreateCondBr(EndCond, LoopBB, AfterBB); + + // Start insertion in LoopBB. + Builder->SetInsertPoint(LoopBB); + + // [Tapir] Create a block for detaching the loop body and a block + // for the continuation of the detach. + BasicBlock *DetachBB = + BasicBlock::Create(*TheContext, "ploop.bodyentry", TheFunction); + BasicBlock *ContinueBB = + BasicBlock::Create(*TheContext, "ploop.continue"); + + // [Tapir] Insert a detach to spawn the loop body. + Builder->CreateDetach(DetachBB, ContinueBB, SyncRegion); + Builder->SetInsertPoint(DetachBB); + + // [Tapir] Emit the spawned loop body. + { + // [Tapir] Create a nested task scope corresponding to the loop + // body, to allow for nested spawns and parallel loops in the + // parallel-loop body. + TaskScopeRAII TaskScope(DetachBB); + + // To avoid races, within the parallel loop's body, the variable + // is stored in a task-local allocation. Create an alloca in the + // task's entry block for this version of the variable. + AllocaInst *VarAlloca = + CreateTaskEntryBlockAlloca(VarName, Type::getInt64Ty(*TheContext)); + // Store the value into the alloca. + Builder->CreateStore(Variable, VarAlloca); + NamedValues[VarName] = VarAlloca; + + // Emit the body of the loop. This, like any other expr, can change the + // current BB. Note that we ignore the value computed by the body, but + // don't allow an error. + if (!Body->codegen()) + return nullptr; + + // [Tapir] Emit the reattach to terminate the task containing the + // body of the parallel loop. + Builder->CreateReattach(ContinueBB, SyncRegion); + } + + // Emit the continue block of the detach. + TheFunction->insert(TheFunction->end(), ContinueBB); + + // Set the insertion point to the continue block of the detach. + Builder->SetInsertPoint(ContinueBB); + + // Emit the step value. + Value *StepVal = nullptr; + if (Step) { + StepVal = Step->codegen(); + if (!StepVal) + return nullptr; + } else { + // If not specified, use 1. + // [Tapir] For the LoopSpawning pass to work, we ensure that + // StepVal is an integer. + StepVal = ConstantInt::get(*TheContext, APSInt::get(1)); + } + Value *NextVar = Builder->CreateAdd(Variable, StepVal, "nextvar"); + + // Insert a back edge to CondBB + BranchInst *BackEdge = Builder->CreateBr(CondBB); + + // [Tapir] Emit loop metadata, so LoopSpawning will work on this + // loop. + std::vector LoopMetadata = GetTapirLoopMetadata(); + if (!LoopMetadata.empty()) { + auto TempNode = MDNode::getTemporary(*TheContext, std::nullopt); + LoopMetadata.insert(LoopMetadata.begin(), TempNode.get()); + auto LoopID = MDNode::get(*TheContext, LoopMetadata); + LoopID->replaceOperandWith(0, LoopID); + BackEdge->setMetadata(LLVMContext::MD_loop, LoopID); + } + + // Add a new entry to the PHI node for the backedge. + Variable->addIncoming(NextVar, ContinueBB); + + // Emit the "after loop" block. + TheFunction->insert(TheFunction->end(), AfterBB); + + // Any new code will be inserted in AfterBB. + Builder->SetInsertPoint(AfterBB); + + // [Tapir] Create the "after sync" block and insert it. + BasicBlock *AfterSync = + BasicBlock::Create(*TheContext, "aftersync", TheFunction); + + // [Tapir] Insert a sync for the loop. 
+ Builder->CreateSync(AfterSync, SyncRegion); + Builder->SetInsertPoint(AfterSync); + + // Restore the unshadowed variable. + if (OldVal) + NamedValues[VarName] = OldVal; + else + NamedValues.erase(VarName); + + // parfor expr always returns 0.0. + return Constant::getNullValue(Type::getDoubleTy(*TheContext)); +} + +Function *PrototypeAST::codegen() { + // Make the function type: double(double,double) etc. + std::vector Doubles(Args.size(), Type::getDoubleTy(*TheContext)); + FunctionType *FT = + FunctionType::get(Type::getDoubleTy(*TheContext), Doubles, false); + + Function *F = + Function::Create(FT, Function::ExternalLinkage, Name, TheModule.get()); + + // Set names for all arguments. + unsigned Idx = 0; + for (auto &Arg : F->args()) + Arg.setName(Args[Idx++]); + + return F; +} + +static void CreateOptimizationPassPipeline(ModulePassManager &MPM); + +static void RunOptimizations() { + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + ModuleAnalysisManager MAM; + + // Create TargetLibraryInfo for setting the target of Tapir lowering. + Triple TargetTriple(TheModule->getTargetTriple()); + TargetLibraryInfoImpl TLII(TargetTriple); + + // Set the target for Tapir lowering to the Cilk runtime system. + TLII.setTapirTarget(TheTapirTarget); + if (TapirTargetID::OpenCilk == TheTapirTarget) + TLII.setTapirTargetOptions( + std::make_unique(OpenCilkRuntimeBCPath)); + + // Add the TargetLibraryInfo to the pass manager. + FAM.registerPass([&] { return TargetLibraryAnalysis(TLII); }); + // Register necessary analyses. + FAM.registerPass([&] { + AAManager AA; + AA.registerFunctionAnalysis(); + return AA; + }); + FAM.registerPass([&] { return AssumptionAnalysis(); }); + FAM.registerPass([&] { return BasicAA(); }); + FAM.registerPass([&] { return BlockFrequencyAnalysis(); }); + FAM.registerPass([&] { return BranchProbabilityAnalysis(); }); + FAM.registerPass([&] { return DependenceAnalysis(); }); + FAM.registerPass([&] { return DominatorTreeAnalysis(); }); + FAM.registerPass([&] { return LoopAnalysis(); }); + FAM.registerPass([&] { return MemoryDependenceAnalysis(); }); + FAM.registerPass([&] { return MemorySSAAnalysis(); }); + FAM.registerPass([&] { return OptimizationRemarkEmitterAnalysis(); }); + FAM.registerPass([&] { return PostDominatorTreeAnalysis(); }); + FAM.registerPass([&] { return ScalarEvolutionAnalysis(); }); + FAM.registerPass([&] { return TapirRaceDetect(); }); + FAM.registerPass([&] { return TargetIRAnalysis(); }); + FAM.registerPass([&] { return TaskAnalysis(); }); + LAM.registerPass([&] { return PassInstrumentationAnalysis(); }); + FAM.registerPass([&] { return PassInstrumentationAnalysis(); }); + MAM.registerPass([&] { return CallGraphAnalysis(); }); + MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); + MAM.registerPass([&] { return ProfileSummaryAnalysis(); }); + // Cross-register analysis proxies. + MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); }); + FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); }); + FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); }); + LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); }); + + // Build the optimization pipeline. + ModulePassManager MPM; + CreateOptimizationPassPipeline(MPM); + // Run the optimizer on the function. + MPM.run(*TheModule, MAM); +} + +Function *FunctionAST::codegen() { + // Transfer ownership of the prototype to the FunctionProtos map, but keep a + // reference to it for use below. 
+ auto &P = *Proto; + FunctionProtos[Proto->getName()] = std::move(Proto); + Function *TheFunction = getFunction(P.getName()); + if (!TheFunction) + return nullptr; + + // If this is an operator, install it. + if (P.isBinaryOp()) + BinopPrecedence[P.getOperatorName()] = P.getBinaryPrecedence(); + + // Create a new basic block to start insertion into. + BasicBlock *BB = BasicBlock::Create(*TheContext, "entry", TheFunction); + Builder->SetInsertPoint(BB); + + // Record the function arguments in the NamedValues map. + NamedValues.clear(); + for (auto &Arg : TheFunction->args()) { + // Create an alloca for this variable. + AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, Arg.getName()); + + // Store the initial value into the alloca. + Builder->CreateStore(&Arg, Alloca); + + // Add arguments to variable symbol table. + NamedValues[std::string(Arg.getName())] = Alloca; + } + + TaskScopeRAII TaskScope(BB); + if (Value *RetVal = Body->codegen()) { + // Finish off the function. + if (RetVal->getType()->isIntegerTy()) + RetVal = Builder->CreateSIToFP(RetVal, Type::getDoubleTy(*TheContext)); + Builder->CreateRet(RetVal); + + TheFunction->setDoesNotThrow(); + + // Mark the function for race-detection + if (RunCilksan) + TheFunction->addFnAttr(Attribute::SanitizeCilk); + + // Validate the generated code, checking for consistency. + verifyFunction(*TheFunction); + + // Run the optimizer on the function. + RunOptimizations(); + + return TheFunction; + } + + // Error reading body, remove function. + TheFunction->eraseFromParent(); + + if (P.isBinaryOp()) + BinopPrecedence.erase(P.getOperatorName()); + return nullptr; +} + +//===----------------------------------------------------------------------===// +// Top-Level parsing and JIT Driver +//===----------------------------------------------------------------------===// + +static void AddTapirLoweringPasses(ModulePassManager &MPM); + +static void CreateOptimizationPassPipeline(ModulePassManager &MPM) { + if (PrintIRBeforeOpt(PrintIRLvl)) { + MPM.addPass(createModuleToFunctionPassAdaptor( + PrintFunctionPass(errs(), "IR dump before optimizations"))); + } + + if (Optimize) { + FunctionPassManager FPM; + // Promote memory to registers. + FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + // Catch trivial redundancies + FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); + // Do simple "peephole" optimizations and bit-twiddling optzns. + FPM.addPass(InstCombinePass()); + // Reassociate expressions. + FPM.addPass(ReassociatePass()); + // Eliminate Common SubExpressions. + FPM.addPass(GVNPass()); + // Simplify the control flow graph (deleting unreachable blocks, etc). + FPM.addPass(SimplifyCFGPass()); + + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + + // If requested, run the CilkSanitizer pass. + if (RunCilksan) { + MPM.addPass(CSISetupPass()); + MPM.addPass(CilkSanitizerPass()); + } + + if (PrintIRBeforeTapirLowering(PrintIRLvl)) { + MPM.addPass(createModuleToFunctionPassAdaptor( + PrintFunctionPass(errs(), "IR dump before Tapir lowering"))); + } + + // Add Tapir lowering passes. + AddTapirLoweringPasses(MPM); +} + +static void AddTapirLoweringPasses(ModulePassManager &MPM) { + // First, handle Tapir loops. Loops are handled by first canonicalizing their + // representation and then performing LoopSpawning to ensure that iterations + // are spawned efficiently in parallel. + if (Optimize) { + FunctionPassManager FPM; + LoopPassManager LPM1, LPM2; + // Start by simplifying the loops. 
+ LPM1.addPass(LoopInstSimplifyPass()); + LPM1.addPass(LoopSimplifyCFGPass()); + // Hoist loop invariants + LPM1.addPass(LICMPass(/*LicmMssaOptCap*/ 100, + /*LicmMssaNoAccForPromotionCap*/ 250, + /*AllowSpeculation=*/true)); + // Cleanup the CFG and instructions + FPM.addPass( + RequireAnalysisPass()); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + FPM.addPass(SimplifyCFGPass()); + FPM.addPass(InstCombinePass()); + // Re-rotate loops in all our loop nests. + LPM2.addPass(LoopRotatePass(/* Disable header duplication */ true, + /* isLTOPreLink */ false)); + // Simplify the loop induction variables. + LPM2.addPass(IndVarSimplifyPass()); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM2), + /*UseMemorySSA=*/false, + /*UseBlockFrequencyInfo=*/false)); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + + // Transform Tapir loops to ensure that iterations are spawned efficiently + // in parallel. + if (TheTapirTarget != TapirTargetID::None) { + MPM.addPass(LoopSpawningPass()); + // The LoopSpawning pass may leave cruft around. Clean it up. + MPM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass())); + } + + if (PrintIRAfterTapirLoopSpawning(PrintIRLvl)) { + MPM.addPass(PrintModulePass(errs(), "IR dump after Tapir loop spawning")); + } + } + + // Second, lower Tapir constructs in general to some parallel runtime system, + // as specified in TargetLibraryInfo. + + // Add pass to lower Tapir to the target runtime. + if (TheTapirTarget != TapirTargetID::None) { + MPM.addPass(TapirToTargetPass()); + + if (Optimize) { + FunctionPassManager FPM; + FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); + FPM.addPass(SimplifyCFGPass()); + FPM.addPass(InstCombinePass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + + // Perform some cleanup after the lowering pass. + MPM.addPass(AlwaysInlinerPass( + /*InsertLifetimeIntrinsics=*/false)); + MPM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass())); + + if (PrintIRAfterTapirLowering(PrintIRLvl)) { + MPM.addPass(PrintModulePass(errs(), "IR dump after Tapir lowering")); + } + } +} + +static void InitializeModule() { + // Open a new module. + TheContext = std::make_unique(); + std::string ModuleName; + static size_t Counter = 0; + raw_string_ostream(ModuleName) << "my_module." << Counter++; + TheModule = std::make_unique(ModuleName, *TheContext); + + // Set the target triple to match the system. + auto SysTargetTriple = sys::getDefaultTargetTriple(); + TheModule->setTargetTriple(SysTargetTriple); + // Set an appropriate data layout + TheModule->setDataLayout(TheJIT->getDataLayout()); + + // Create a new builder for the module. + Builder = std::make_unique>(*TheContext); +} + +static void HandleDefinition() { + if (auto FnAST = ParseDefinition()) { + if (auto *FnIR = FnAST->codegen()) { + ExitOnErr(TheJIT->addModule( + ThreadSafeModule(std::move(TheModule), std::move(TheContext)))); + InitializeModule(); + } + } else { + // Skip token for error recovery. + getNextToken(); + } +} + +static void HandleExtern() { + if (auto ProtoAST = ParseExtern()) { + if (auto *FnIR = ProtoAST->codegen()) { + FunctionProtos[ProtoAST->getName()] = std::move(ProtoAST); + } + } else { + // Skip token for error recovery. + getNextToken(); + } +} + +static void HandleTopLevelExpression() { + // Evaluate a top-level expression into an anonymous function. 
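+  // For example, typing "1 + 2;" at the "ready>" prompt wraps the expression
+  // in an anonymous function named "__anon_expr", JIT-compiles and runs it,
+  // and prints "Evaluated to 3.000000".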
+ if (auto FnAST = ParseTopLevelExpr()) { + if (FnAST->codegen()) { + // Create a ResourceTracker to track JIT'd memory allocated to our + // anonymous expression -- that way we can free it after executing. + auto RT = TheJIT->getMainJITDylib().createResourceTracker(); + + auto TSM = ThreadSafeModule(std::move(TheModule), std::move(TheContext)); + ExitOnErr(TheJIT->addModule(std::move(TSM), RT)); + InitializeModule(); + + // Search the JIT for the __anon_expr symbol. + auto ExprSymbol = ExitOnErr(TheJIT->lookup("__anon_expr")); + + // Run initializers. + ExitOnErr(TheJIT->initialize()); + + std::unique_ptr T = + std::make_unique("__anon_expr", "Top-level expression"); + // Get the symbol's address and cast it to the right type (takes no + // arguments, returns a double) so we can call it as a native function. + double (*FP)() = ExprSymbol.getAddress().toPtr(); + T->startTimer(); + double Result = FP(); + T->stopTimer(); + fprintf(stderr, "Evaluated to %f\n", Result); + + // Delete the anonymous expression module from the JIT. + ExitOnErr(RT->remove()); + } + } else { + // Skip token for error recovery. + getNextToken(); + } +} + +/// top ::= definition | external | expression | ';' +static void MainLoop() { + while (true) { + switch (CurTok) { + case tok_eof: + return; + case ';': // ignore top-level semicolons. + getNextToken(); + break; + case tok_def: + HandleDefinition(); + break; + case tok_extern: + HandleExtern(); + break; + default: + HandleTopLevelExpression(); + break; + } + fprintf(stderr, "ready> "); + } +} + +//===----------------------------------------------------------------------===// +// Main driver code. +//===----------------------------------------------------------------------===// + +static int usage(char *argv[]) { + errs() << "Usage: " << argv[0] + << " [-h|--help]" + << " [--lower-tapir-to {cilk|none}]" + << " [--run-cilksan]" + << " [--print-ir {before-opt|before-tapir-lowering|after-tapir-loop-spawning|after-tapir-lowering|all}]" + << " [-O[0-3]]" + << "\n"; + return 1; +} + +int main(int argc, char *argv[]) { + // Set the default Tapir target to be OpenCilk. 
+ TheTapirTarget = TapirTargetID::OpenCilk; + + // Parse command-line arguments + for (int i = 1; i < argc; ++i) { + if (std::string(argv[i]) == "-h" || std::string(argv[i]) == "--help") { + return usage(argv); + } else if (std::string(argv[i]) == "--lower-tapir-to") { + std::string targetStr = std::string(argv[++i]); + if (targetStr == "cilk") { + TheTapirTarget = TapirTargetID::OpenCilk; + } else if (targetStr == "none") { + TheTapirTarget = TapirTargetID::None; + } else { + return usage(argv); + } + } else if (std::string(argv[i]) == "--run-cilksan") { + RunCilksan = true; + } else if (std::string(argv[i]) == "--print-ir") { + // PrintIR = true; + std::string level = std::string(argv[++i]); + if (level == "before-opt" || level == "all") { + PrintIRLvl = setPrintIRBeforeOpt(PrintIRLvl); + } + if (level == "before-tapir-lowering" || level == "all") { + PrintIRLvl = setPrintIRBeforeTapirLowering(PrintIRLvl); + } + if (level == "after-tapir-loop-spawning" || level == "all") { + PrintIRLvl = setPrintIRAfterTapirLoopSpawning(PrintIRLvl); + } + if (level == "after-tapir-lowering" || level == "all") { + PrintIRLvl = setPrintIRAfterTapirLowering(PrintIRLvl); + } + if (PrintIRLvl == PrintIR_None) + return usage(argv); + } else if (std::string(argv[i]) == "-O0") { + Optimize = false; + } else if ((std::string(argv[i]) == "-O1") || + (std::string(argv[i]) == "-O2") || + (std::string(argv[i]) == "-O3")) { + Optimize = true; + } else { + return usage(argv); + } + } + + // Get the system architecture name. + Triple SysTriple(sys::getDefaultTargetTriple()); + StringRef ArchName = SysTriple.getArchName(); + + if (TapirTargetID::OpenCilk == TheTapirTarget) { + // Set the path to the OpenCilk runtime-ABI bitcode file. + std::optional Path = sys::Process::FindInEnvPath( + "LIBRARY_PATH", ("libopencilk-abi-" + ArchName + ".bc").str()); + if (!Path.has_value()) + Path = sys::Process::FindInEnvPath("LIBRARY_PATH", "libopencilk-abi.bc"); + if (!Path.has_value()) { + errs() << "Error: Cannot find OpenCilk runtime-ABI bitcode file " + "LIBRARY_PATH.\n"; + return 1; + } + OpenCilkRuntimeBCPath = *Path; + } + + InitializeNativeTarget(); + InitializeNativeTargetAsmPrinter(); + InitializeNativeTargetAsmParser(); + + // Install standard binary operators. + // 1 is lowest precedence. + BinopPrecedence['='] = 2; + BinopPrecedence['<'] = 10; + BinopPrecedence['+'] = 20; + BinopPrecedence['-'] = 20; + BinopPrecedence['*'] = 40; // highest. + + // Prime the first token. + fprintf(stderr, "ready> "); + getNextToken(); + + TheJIT = ExitOnErr(KaleidoscopeJIT::Create()); + + if (TapirTargetID::OpenCilk == TheTapirTarget) { + // Link the OpenCilk runtime library. + std::optional Path = sys::Process::FindInEnvPath( + "LD_LIBRARY_PATH", + ("libopencilk-personality-c-" + ArchName + ".so").str()); + if (!Path.has_value()) + Path = sys::Process::FindInEnvPath("LD_LIBRARY_PATH", + "libopencilk-personality-c.so"); + if (!Path.has_value()) { + errs() << "Error: Cannot find OpenCilk runtime library in " + "LD_LIBRARY_PATH.\n"; + return 1; + } + TheJIT->loadLibrary(Path->c_str()); + Path = sys::Process::FindInEnvPath( + "LD_LIBRARY_PATH", ("libopencilk-" + ArchName + ".so").str()); + if (!Path.has_value()) + Path = sys::Process::FindInEnvPath("LD_LIBRARY_PATH", "libopencilk.so"); + if (!Path.has_value()) { + errs() << "Error: Cannot find OpenCilk runtime library in " + "LD_LIBRARY_PATH.\n"; + return 1; + } + TheJIT->loadLibrary(Path->c_str()); + } + + if (RunCilksan) { + // Add the Cilksan runtime library. 
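Design note on the runtime discovery above: the same two-step lookup, architecture-suffixed name first and then the generic name, repeats for the ABI bitcode, both OpenCilk shared libraries, and the Cilksan library below. A small helper along the following lines could factor it out; the helper name and signature are hypothetical and not part of this patch.

```cpp
// Hypothetical helper (not in the patch): look up an arch-specific file name
// first, then fall back to the generic name, mirroring the logic above.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Process.h"
#include <optional>
#include <string>

static std::optional<std::string>
findRuntimeFile(llvm::StringRef EnvVar, llvm::StringRef Stem,
                llvm::StringRef ArchName, llvm::StringRef Ext) {
  // E.g., findRuntimeFile("LD_LIBRARY_PATH", "libopencilk", ArchName, ".so").
  if (auto P = llvm::sys::Process::FindInEnvPath(
          EnvVar, (Stem + "-" + ArchName + Ext).str()))
    return P;
  return llvm::sys::Process::FindInEnvPath(EnvVar, (Stem + Ext).str());
}
```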
+ std::optional Path = sys::Process::FindInEnvPath( + "LD_LIBRARY_PATH", ("libclang_rt.cilksan-" + ArchName + ".so").str()); + if (!Path.has_value()) + Path = sys::Process::FindInEnvPath("LD_LIBRARY_PATH", + "libclang_rt.cilksan.so"); + if (!Path.has_value()) { + errs() + << "Error: Cannot find Cilksan runtime library in LD_LIBRARY_PATH.\n"; + return 1; + } + TheJIT->loadLibrary(Path->c_str()); + } + + InitializeModule(); + + // Run the main "interpreter loop" now. + MainLoop(); + + return 0; +} diff --git a/llvm/examples/Kaleidoscope/lib/toylib.c b/llvm/examples/Kaleidoscope/lib/toylib.c new file mode 100644 index 000000000000000..f38b1a3dbc2009a --- /dev/null +++ b/llvm/examples/Kaleidoscope/lib/toylib.c @@ -0,0 +1,24 @@ +#include +#include + +//===----------------------------------------------------------------------===// +// "Library" functions that can be "extern'd" from user code. +//===----------------------------------------------------------------------===// + +#ifdef _WIN32 +#define DLLEXPORT __declspec(dllexport) +#else +#define DLLEXPORT +#endif + +/// putchard - putchar that takes a double and returns 0. +DLLEXPORT double putchard(double X) { + fputc((char)X, stderr); + return 0; +} + +/// printd - printf that takes a double prints it as "%f\n", returning 0. +DLLEXPORT double printd(double X) { + fprintf(stderr, "%f\n", X); + return 0; +} diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 223d8efe57daaec..6f9c20ecafe5c55 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -142,7 +142,12 @@ typedef enum { LLVMCatchRet = 62, LLVMCatchPad = 63, LLVMCleanupPad = 64, - LLVMCatchSwitch = 65 + LLVMCatchSwitch = 65, + + /* Parallel operators */ + LLVMDetach = 69, + LLVMReattach = 70, + LLVMSync = 71, } LLVMOpcode; typedef enum { @@ -1887,6 +1892,9 @@ unsigned LLVMGetTargetExtTypeIntParam(LLVMTypeRef TargetExtTy, unsigned Idx); macro(CatchReturnInst) \ macro(CatchSwitchInst) \ macro(CallBrInst) \ + macro(DetachInst) \ + macro(ReattachInst) \ + macro(SyncInst) \ macro(FuncletPadInst) \ macro(CatchPadInst) \ macro(CleanupPadInst) \ @@ -4212,6 +4220,18 @@ LLVMValueRef LLVMBuildCatchSwitch(LLVMBuilderRef B, LLVMValueRef ParentPad, LLVMBasicBlockRef UnwindBB, unsigned NumHandlers, const char *Name); +/* Tapir */ +LLVMValueRef LLVMBuildDetach(LLVMBuilderRef B, + LLVMBasicBlockRef DetachBB, + LLVMBasicBlockRef ContinueBB, + LLVMValueRef SyncRegion); +LLVMValueRef LLVMBuildReattach(LLVMBuilderRef B, + LLVMBasicBlockRef ReattachBB, + LLVMValueRef SyncRegion); +LLVMValueRef LLVMBuildSync(LLVMBuilderRef B, + LLVMBasicBlockRef ContinueBB, + LLVMValueRef SyncRegion); + /* Add a case to the switch instruction */ void LLVMAddCase(LLVMValueRef Switch, LLVMValueRef OnVal, LLVMBasicBlockRef Dest); diff --git a/llvm/include/llvm-c/Transforms/Tapir.h b/llvm/include/llvm-c/Transforms/Tapir.h new file mode 100644 index 000000000000000..c2a363353d1e2c7 --- /dev/null +++ b/llvm/include/llvm-c/Transforms/Tapir.h @@ -0,0 +1,46 @@ +/*===- Tapir.h - Tapir Transformation Library C Interface -------*- C++ -*-===*\ +|* *| +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM *| +|* Exceptions. *| +|* See https://llvm.org/LICENSE.txt for license information. *| +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This header declares the C interface to libLLVMTapirOpts.a, which *| +|* implements various Tapir transformations of the LLVM IR. 
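The LLVMBuildDetach, LLVMBuildReattach, and LLVMBuildSync entry points added to llvm-c/Core.h above expose Tapir's terminators through the C API. A minimal sketch of emitting one spawn with them follows; it assumes the sync-region token was already produced by the llvm.syncregion.start intrinsic, that the basic blocks already exist, and the function name is illustrative.

```cpp
// Sketch: one spawn via the C API builders declared above. `SyncRegion` is
// assumed to be the token from a prior llvm.syncregion.start call.
#include "llvm-c/Core.h"

static void emitSimpleSpawn(LLVMBuilderRef B, LLVMValueRef SyncRegion,
                            LLVMBasicBlockRef Detached,
                            LLVMBasicBlockRef Continue,
                            LLVMBasicBlockRef AfterSync) {
  // Fork: run Detached as a spawned task; the parent resumes at Continue.
  LLVMBuildDetach(B, Detached, Continue, SyncRegion);

  // The spawned task ends by reattaching to the continuation block.
  LLVMPositionBuilderAtEnd(B, Detached);
  LLVMBuildReattach(B, Continue, SyncRegion);

  // The continuation waits for outstanding tasks in this sync region.
  LLVMPositionBuilderAtEnd(B, Continue);
  LLVMBuildSync(B, AfterSync, SyncRegion);
}
```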
*| +|* *| +|* Many exotic languages can interoperate with C code but have a harder time *| +|* with C++ due to name mangling. So in addition to C, this interface enables *| +|* tools written in such languages. *| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_C_TRANSFORMS_TAPIR_H +#define LLVM_C_TRANSFORMS_TAPIR_H + +#include "llvm-c/ExternC.h" +#include "llvm-c/Types.h" + +LLVM_C_EXTERN_C_BEGIN + +/** + * @defgroup LLVMCTransformsTapir Tapir transformations + * @ingroup LLVMCTransforms + * + * @{ + */ + +/** See llvm::createLowerTapirToTargetPass function. */ +void LLVMAddLowerTapirToTargetPass(LLVMPassManagerRef PM); + +/** See llvm::createLoopSpawningPass function. */ +void LLVMAddLoopSpawningPass(LLVMPassManagerRef PM); + +/** + * @} + */ + +LLVM_C_EXTERN_C_END + +#endif diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index 1b5a6ee24b86109..ff3817815bff999 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -59,10 +59,12 @@ class BasicBlock; class CatchPadInst; class CatchReturnInst; class DominatorTree; +class DetachInst; class FenceInst; class Function; class LoopInfo; class PreservedAnalyses; +class SyncInst; class TargetLibraryInfo; class Value; @@ -303,6 +305,10 @@ class AAQueryInfo { /// passes that lazily update the DT while performing AA queries. bool UseDominatorTree = true; + /// Whether the instructions corresponding with this query should be + /// considered as part of the same spindle. + bool AssumeSameSpindle = false; + AAQueryInfo(AAResults &AAR, CaptureInfo *CI) : AAR(AAR), CI(CI) {} }; @@ -356,6 +362,11 @@ class AAResults { /// alias analysis implementations. AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB); + /// Version of alias() method where the assumption is explicitly stated of + /// whether the query applies to operations within the same spindle. + AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB, + bool AssumeSameSpindle); + /// A convenience wrapper around the primary \c alias interface. AliasResult alias(const Value *V1, LocationSize V1Size, const Value *V2, LocationSize V2Size) { @@ -525,6 +536,8 @@ class AAResults { /// Return information about whether a call and an instruction may refer to /// the same memory locations. ModRefInfo getModRefInfo(const Instruction *I, const CallBase *Call); + ModRefInfo getModRefInfo(const Instruction *I, const CallBase *Call, + bool AssumeSameSpindle); /// Return information about whether a particular call site modifies /// or reads the specified memory location \p MemLoc before instruction \p I @@ -602,6 +615,10 @@ class AAResults { AAQueryInfo &AAQI); ModRefInfo getModRefInfo(const CatchReturnInst *I, const MemoryLocation &Loc, AAQueryInfo &AAQI); + ModRefInfo getModRefInfo(const DetachInst *D, const MemoryLocation &Loc, + AAQueryInfo &AAQI); + ModRefInfo getModRefInfo(const SyncInst *S, const MemoryLocation &Loc, + AAQueryInfo &AAQI); ModRefInfo getModRefInfo(const Instruction *I, const std::optional &OptLoc, AAQueryInfo &AAQIP); @@ -610,6 +627,12 @@ class AAResults { AAQueryInfo &AAQIP); MemoryEffects getMemoryEffects(const CallBase *Call, AAQueryInfo &AAQI); + /// Return the behavior for the task detached from a given detach instruction. + MemoryEffects getMemoryEffects(const DetachInst *D, AAQueryInfo &AAQI); + + /// Return the behavior for a sync instruction. 
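The alias() overload with an explicit AssumeSameSpindle flag, declared above, lets a client state that two accesses execute on the same strand. A small illustrative wrapper; the function name and the wrapping itself are mine, not from the patch.

```cpp
// Sketch: same-spindle alias query using the overload declared above.
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"

static bool distinctOnSameStrand(llvm::AAResults &AA,
                                 const llvm::MemoryLocation &A,
                                 const llvm::MemoryLocation &B) {
  // With AssumeSameSpindle=true, strand_noalias-based facts may apply.
  return AA.alias(A, B, /*AssumeSameSpindle=*/true) ==
         llvm::AliasResult::NoAlias;
}
```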
+ MemoryEffects getMemoryEffects(const SyncInst *S, AAQueryInfo &AAQI); + private: class Concept; @@ -651,6 +674,14 @@ class BatchAAResults { bool IgnoreLocals = false) { return AA.getModRefInfoMask(Loc, AAQI, IgnoreLocals); } + ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2, + bool AssumeSameSpindle) { + bool OldAssumeSameSpindle = AAQI.AssumeSameSpindle; + AAQI.AssumeSameSpindle = AssumeSameSpindle; + auto Result = AA.getModRefInfo(Call1, Call2, AAQI); + AAQI.AssumeSameSpindle = OldAssumeSameSpindle; + return Result; + } ModRefInfo getModRefInfo(const Instruction *I, const std::optional &OptLoc) { return AA.getModRefInfo(I, OptLoc, AAQI); @@ -658,6 +689,14 @@ class BatchAAResults { ModRefInfo getModRefInfo(const Instruction *I, const CallBase *Call2) { return AA.getModRefInfo(I, Call2, AAQI); } + ModRefInfo getModRefInfo(Instruction *I, const CallBase *Call2, + bool AssumeSameSpindle) { + bool OldAssumeSameSpindle = AAQI.AssumeSameSpindle; + AAQI.AssumeSameSpindle = AssumeSameSpindle; + auto Result = AA.getModRefInfo(I, Call2, AAQI); + AAQI.AssumeSameSpindle = OldAssumeSameSpindle; + return Result; + } ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) { return AA.getArgModRefInfo(Call, ArgIdx); } @@ -863,6 +902,11 @@ class AAResultBase { /// Return true if this pointer is returned by a noalias function. bool isNoAliasCall(const Value *V); +/// Return true if this pointer is returned by a noalias function or, if one +/// assumes the query pertains to operations in the same spindle, a +/// strand_noalias function. +bool isNoAliasCallIfInSameSpindle(const Value *V); + /// Return true if this pointer refers to a distinct and identifiable object. /// This returns true for: /// Global Variables and Functions (but not Global Aliases) @@ -872,6 +916,14 @@ bool isNoAliasCall(const Value *V); /// bool isIdentifiedObject(const Value *V); +/// Return true if this pointer refers to a distinct and identifiable object +/// when the query occurs between operations in the same spindle. +/// This returns true for: +/// Every value for which isIdentifiedObject(V) returns true +/// StrandNoAlias returns +/// +bool isIdentifiedObjectIfInSameSpindle(const Value *V); + /// Return true if V is umabigously identified at the function-level. /// Different IdentifiedFunctionLocals can't alias. /// Further, an IdentifiedFunctionLocal can not alias with any function diff --git a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h index 7eca82729430ddd..1169b87bd6f7867 100644 --- a/llvm/include/llvm/Analysis/BasicAliasAnalysis.h +++ b/llvm/include/llvm/Analysis/BasicAliasAnalysis.h @@ -146,6 +146,10 @@ class BasicAAResult : public AAResultBase { const Value *V2, LocationSize V2Size, AAQueryInfo &AAQI, const Value *O1, const Value *O2); + + AliasResult checkInjectiveArguments(const Value *V1, const Value *O1, + const Value *V2, const Value *O2, + AAQueryInfo &AAQI); }; /// Analysis pass providing a never-invalidated alias analysis result. diff --git a/llvm/include/llvm/Analysis/CodeMetrics.h b/llvm/include/llvm/Analysis/CodeMetrics.h index d09018daf954884..c16ed0301a90128 100644 --- a/llvm/include/llvm/Analysis/CodeMetrics.h +++ b/llvm/include/llvm/Analysis/CodeMetrics.h @@ -24,6 +24,7 @@ class Instruction; class Loop; class Function; template class SmallPtrSetImpl; +class TargetLibraryInfo; class TargetTransformInfo; class Value; @@ -63,6 +64,9 @@ struct CodeMetrics { /// Keep track of the number of calls to 'big' functions. 
unsigned NumCalls = false; + /// Keep track of the number of calls to 'builtin' functions. + unsigned NumBuiltinCalls = 0; + /// The number of calls to internal functions with a single caller. /// /// These are likely targets for future inlining, likely exposed by @@ -80,7 +84,8 @@ struct CodeMetrics { /// Add information about a block to the current state. void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI, const SmallPtrSetImpl &EphValues, - bool PrepareForLTO = false, const Loop *L = nullptr); + bool PrepareForLTO = false, const Loop* L = nullptr, + TargetLibraryInfo *TLI = nullptr); /// Collect a loop's ephemeral values (those used only by an assume /// or similar intrinsics in the loop). diff --git a/llvm/include/llvm/Analysis/DataRaceFreeAliasAnalysis.h b/llvm/include/llvm/Analysis/DataRaceFreeAliasAnalysis.h new file mode 100644 index 000000000000000..acdd7508ad41493 --- /dev/null +++ b/llvm/include/llvm/Analysis/DataRaceFreeAliasAnalysis.h @@ -0,0 +1,85 @@ +//===- DataRaceFreeAliasAnalysis.h - DRF-based AA ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This is the interface for an alias analysis based on the assumption that +/// a Tapir program is data-race free. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_DATARACEFREEALIASANALYSIS_H +#define LLVM_ANALYSIS_DATARACEFREEALIASANALYSIS_H + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { + +class TaskInfo; +class MemoryLocation; + +extern cl::opt EnableDRFAA; + +/// A simple alias analysis implementation that implements the assumption that +/// the Tapir program is data-race free. This analysis uses TaskInfo to +/// determine which may-aliasing instructions may happen in parallel. If two +/// that may alias instructions may happen in parallel and the instructions are +/// not otherwise marked atomic, then the data-race-free assumption asserts that +/// they do not alias. +class DRFAAResult : public AAResultBase { + TaskInfo &TI; + +public: + explicit DRFAAResult(TaskInfo &TI) : AAResultBase(), TI(TI) {} + DRFAAResult(DRFAAResult &&Arg) : AAResultBase(std::move(Arg)), TI(Arg.TI) {} + + /// Handle invalidation events in the new pass manager. + bool invalidate(Function &Fn, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &Inv); + + AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB, + AAQueryInfo &AAQI, const Instruction *CtxI); + ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc, + AAQueryInfo &AAQI); + ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2, + AAQueryInfo &AAQI); +}; + +/// Analysis pass providing a never-invalidated alias analysis result. +class DRFAA : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + using Result = DRFAAResult; + + DRFAAResult run(Function &F, FunctionAnalysisManager &AM); +}; + +/// Legacy wrapper pass to provide the DRFAAResult object. 
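Since DRFAA follows the standard AnalysisInfoMixin pattern, it can be registered and queried like any other function analysis. The wiring below is a hedged sketch under that assumption; nothing in it beyond the DRFAA and DRFAAResult names comes from the patch.

```cpp
// Sketch: registering and querying the data-race-free AA (names from above).
#include "llvm/Analysis/DataRaceFreeAliasAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"

static void registerDRFAA(llvm::FunctionAnalysisManager &FAM) {
  FAM.registerPass([] { return llvm::DRFAA(); });
}

static llvm::DRFAAResult &getDRFAAResult(llvm::Function &F,
                                         llvm::FunctionAnalysisManager &FAM) {
  // Only meaningful when TaskInfo is available and the EnableDRFAA option
  // declared above is enabled.
  return FAM.getResult<llvm::DRFAA>(F);
}
```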
+class DRFAAWrapperPass : public FunctionPass { + std::unique_ptr Result; + +public: + static char ID; + + DRFAAWrapperPass(); + + DRFAAResult &getResult() { return *Result; } + const DRFAAResult &getResult() const { return *Result; } + + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +/// Creates an instance of \c DRFAAWrapperPass. +FunctionPass *createDRFAAWrapperPass(); + +} // end namespace llvm + +#endif // LLVM_ANALYSIS_DATARACEFREEALIASANALYSIS_H diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h index f0a09644e0f4b65..7167a9c3543dfab 100644 --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -40,6 +40,8 @@ #define LLVM_ANALYSIS_DEPENDENCEANALYSIS_H #include "llvm/ADT/SmallBitVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -288,6 +290,47 @@ namespace llvm { friend class DependenceInfo; }; + struct GeneralAccess { + Instruction *I = nullptr; + std::optional Loc; + unsigned OperandNum = unsigned(-1); + ModRefInfo ModRef = ModRefInfo::NoModRef; + + GeneralAccess() = default; + GeneralAccess(Instruction *I, std::optional Loc, + unsigned OperandNum, ModRefInfo MRI) + : I(I), Loc(Loc), OperandNum(OperandNum), ModRef(MRI) {} + GeneralAccess(Instruction *I, std::optional Loc, + ModRefInfo MRI) + : I(I), Loc(Loc), ModRef(MRI) {} + + bool isValid() const { + return (I && Loc); + } + const Value *getPtr() const { + if (!Loc) + return nullptr; + return Loc->Ptr; + } + bool isRef() const { + return isRefSet(ModRef); + } + bool isMod() const { + return isModSet(ModRef); + } + + inline bool operator==(const GeneralAccess &RHS) { + if (!isValid() && !RHS.isValid()) + return true; + if (!isValid() || !RHS.isValid()) + return false; + return (I == RHS.I) && (Loc == RHS.Loc) && + (OperandNum == RHS.OperandNum) && (ModRef == RHS.ModRef); + } + }; + + raw_ostream &operator<<(raw_ostream &OS, const GeneralAccess &GA); + /// DependenceInfo - This class is the main dependence-analysis driver. /// class DependenceInfo { @@ -354,6 +397,17 @@ namespace llvm { Function *getFunction() const { return F; } + AAResults *getAA() const { return AA; } + + /// depends - Tests for a dependence between the general accesses SrcA and + /// DstA. Returns NULL if no dependence; otherwise, returns a Dependence + /// (or a FullDependence) with as much information as can be gleaned. The + /// flag PossiblyLoopIndependent should be set by the caller if it appears + /// that control flow can reach from Src to Dst without traversing a loop + /// back edge. + std::unique_ptr depends(GeneralAccess *SrcA, + GeneralAccess *DstA, + bool PossiblyLoopIndependent); private: AAResults *AA; ScalarEvolution *SE; @@ -531,6 +585,7 @@ namespace llvm { const Instruction *Dst); unsigned CommonLevels, SrcLevels, MaxLevels; + const Loop *CommonLoop; /// mapSrcLoop - Given one of the loops containing the source, return /// its level index in our numbering scheme. @@ -544,6 +599,11 @@ namespace llvm { /// in LoopNest. bool isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const; + /// isTrueAtLoopEntry - Returns true if the predicate LHS `Pred` RHS is true + /// at entry of L. 
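The new depends() overload above takes GeneralAccess records rather than raw instructions. Below is a minimal sketch of wrapping a store/load pair and asking for a dependence; the helper name is made up, and real callers would typically build these records while walking the IR.

```cpp
// Sketch: querying DependenceInfo with the GeneralAccess-based overload above.
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/Instructions.h"

static bool mayDepend(llvm::DependenceInfo &DI, llvm::StoreInst *Src,
                      llvm::LoadInst *Dst) {
  using namespace llvm;
  GeneralAccess SrcA(Src, MemoryLocation::get(Src), ModRefInfo::Mod);
  GeneralAccess DstA(Dst, MemoryLocation::get(Dst), ModRefInfo::Ref);
  // PossiblyLoopIndependent=true: control flow may reach Dst from Src
  // without traversing a loop back edge.
  return DI.depends(&SrcA, &DstA, /*PossiblyLoopIndependent=*/true) != nullptr;
}
```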
+ bool isTrueAtLoopEntry(const Loop *L, ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS) const; + /// Makes sure all subscript pairs share the same integer type by /// sign-extending as necessary. /// Sign-extending a subscript is safe because getelementptr assumes the @@ -580,7 +640,8 @@ namespace llvm { /// extensions and symbolics. bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, - const SCEV *Y) const; + const SCEV *Y, + const Loop *L = nullptr) const; /// isKnownLessThan - Compare to see if S is less than Size /// Another wrapper for isKnownNegative(S - max(Size, 1)) with some extra @@ -968,6 +1029,28 @@ namespace llvm { const SCEV *DstAccessFn, SmallVectorImpl &SrcSubscripts, SmallVectorImpl &DstSubscripts); + /// Given a linear access function, tries to recover subscripts + /// for each dimension of the array element access. + bool tryDelinearize(GeneralAccess *SrcA, GeneralAccess *DstA, + SmallVectorImpl &Pair); + + /// Tries to delinearize access function for a fixed size multi-dimensional + /// array, by deriving subscripts from GEP instructions. Returns true upon + /// success and false otherwise. + bool tryDelinearizeFixedSize(GeneralAccess *SrcA, GeneralAccess *DstA, + const SCEV *SrcAccessFn, + const SCEV *DstAccessFn, + SmallVectorImpl &SrcSubscripts, + SmallVectorImpl &DstSubscripts); + + /// Tries to delinearize access function for a multi-dimensional array with + /// symbolic runtime sizes. + /// Returns true upon success and false otherwise. + bool tryDelinearizeParametricSize( + GeneralAccess *SrcA, GeneralAccess *DstA, const SCEV *SrcAccessFn, + const SCEV *DstAccessFn, SmallVectorImpl &SrcSubscripts, + SmallVectorImpl &DstSubscripts); + /// checkSubscript - Helper function for checkSrcSubscript and /// checkDstSubscript to avoid duplicate code bool checkSubscript(const SCEV *Expr, const Loop *LoopNest, diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 95a74b91f7acbf5..4f74bdc1ce1ff5c 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -184,9 +184,9 @@ class MemoryDepChecker { MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L, const DenseMap &SymbolicStrides, - unsigned MaxTargetVectorWidthInBits) + unsigned MaxTargetVectorWidthInBits, TaskInfo *TI = nullptr) : PSE(PSE), InnermostLoop(L), SymbolicStrides(SymbolicStrides), - MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {} + MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits), TI(TI) {} /// Register the location (instructions are given increasing numbers) /// of a write access. @@ -338,6 +338,9 @@ class MemoryDepChecker { std::pair> PointerBounds; + /// Optional TaskInfo + TaskInfo *TI; + /// Check whether there is a plausible dependence between the two /// accesses. /// @@ -625,7 +628,7 @@ class LoopAccessInfo { public: LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, - LoopInfo *LI); + LoopInfo *LI, TaskInfo *TI = nullptr); /// Return true we can analyze the memory accesses in the loop and there are /// no memory dependence cycles. Note that for dependences between loads & @@ -713,7 +716,7 @@ class LoopAccessInfo { /// Analyze the loop. Returns true if all memory access in the loop can be /// vectorized. 
bool analyzeLoop(AAResults *AA, LoopInfo *LI, const TargetLibraryInfo *TLI, - DominatorTree *DT); + DominatorTree *DT, TaskInfo *TI); /// Check if the structure of the loop allows it to be analyzed by this /// pass. @@ -852,12 +855,13 @@ class LoopAccessInfoManager { LoopInfo &LI; TargetTransformInfo *TTI; const TargetLibraryInfo *TLI = nullptr; + TaskInfo *TI = nullptr; public: LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT, LoopInfo &LI, TargetTransformInfo *TTI, - const TargetLibraryInfo *TLI) - : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI) {} + const TargetLibraryInfo *TLI, TaskInfo *TI) + : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), TI(TI) {} const LoopAccessInfo &getInfo(Loop &L); diff --git a/llvm/include/llvm/Analysis/LoopAnalysisManager.h b/llvm/include/llvm/Analysis/LoopAnalysisManager.h index a760ea98d7cfbe3..35785ac13da1b0c 100644 --- a/llvm/include/llvm/Analysis/LoopAnalysisManager.h +++ b/llvm/include/llvm/Analysis/LoopAnalysisManager.h @@ -45,6 +45,7 @@ class MemorySSA; class ScalarEvolution; class TargetLibraryInfo; class TargetTransformInfo; +class TaskInfo; /// The adaptor from a function pass to a loop pass computes these analyses and /// makes them available to the loop passes "for free". Each loop pass is @@ -58,6 +59,7 @@ struct LoopStandardAnalysisResults { ScalarEvolution &SE; TargetLibraryInfo &TLI; TargetTransformInfo &TTI; + TaskInfo &TI; BlockFrequencyInfo *BFI; BranchProbabilityInfo *BPI; MemorySSA *MSSA; diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h index 054206925483c59..ae0998dccd94093 100644 --- a/llvm/include/llvm/Analysis/LoopInfo.h +++ b/llvm/include/llvm/Analysis/LoopInfo.h @@ -62,6 +62,51 @@ class LLVM_EXTERNAL_VISIBILITY Loop : public LoopBase { explicit operator bool() const { return Start && End; } }; + /// Return all blocks inside the loop that have successors outside of the + /// loop. These are the blocks _inside of the current loop_ which branch out. + /// The returned list is always unique. + void getExitingBlocks(SmallVectorImpl &ExitingBlocks, + bool IgnoreDetachUnwind = false) const; + + /// If getExitingBlocks would return exactly one block, return that block. + /// Otherwise return null. + BasicBlock *getExitingBlock(bool IgnoreDetachUnwind = false) const; + + /// Get basic blocks that are outside of the loop, based on CFG analysis, but + /// inside tasks created within the loop. Many analyses and optimizations + /// should treat these blocks as part of the loop. + void getTaskExits(SmallPtrSetImpl &TaskExits) const; + + /// Return all of the successor blocks of this loop. These are the blocks + /// _outside of the current loop_ which are branched to. + void getExitBlocks(SmallVectorImpl &ExitBlocks) const; + + /// If getExitBlocks would return exactly one block, return that block. + /// Otherwise return null. + BasicBlock *getExitBlock() const; + + /// Return true if no exit block for the loop has a predecessor that is + /// outside the loop. + bool hasDedicatedExits() const; + + /// Return all unique successor blocks of this loop. + /// These are the blocks _outside of the current loop_ which are branched to. + void getUniqueExitBlocks(SmallVectorImpl &ExitBlocks) const; + + /// Return all unique successor blocks of this loop except successors from + /// Latch block are not considered. If the exit comes from Latch has also + /// non Latch predecessor in a loop it will be added to ExitBlocks. 
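getTaskExits(), declared in the Loop additions above, exposes blocks that are outside the loop in CFG terms but inside tasks the loop spawns. The sketch below visits the loop together with those blocks; the helper is illustrative only, and it assumes the set element type is BasicBlock *.

```cpp
// Sketch: treating task-exit blocks as part of a Tapir loop when scanning it.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/LoopInfo.h"

static void forEachEffectiveLoopBlock(
    const llvm::Loop &L, llvm::function_ref<void(llvm::BasicBlock *)> Fn) {
  // Blocks reachable only inside tasks spawned by the loop are outside the
  // loop per the CFG, but most analyses should still visit them.
  llvm::SmallPtrSet<llvm::BasicBlock *, 8> TaskExits;
  L.getTaskExits(TaskExits);
  for (llvm::BasicBlock *BB : L.blocks())
    Fn(BB);
  for (llvm::BasicBlock *BB : TaskExits)
    Fn(BB);
}
```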
+ /// These are the blocks _outside of the current loop_ which are branched to. + void + getUniqueNonLatchExitBlocks(SmallVectorImpl &ExitBlocks) const; + + /// If getUniqueExitBlocks would return exactly one block, return that block. + /// Otherwise return null. + BasicBlock *getUniqueExitBlock() const; + + /// Return all pairs of (_inside_block_,_outside_block_). + void getExitEdges(SmallVectorImpl &ExitEdges) const; + /// Return true if the specified value is loop invariant. bool isLoopInvariant(const Value *V) const; @@ -372,6 +417,14 @@ class LLVM_EXTERNAL_VISIBILITY Loop : public LoopBase { /// Add llvm.loop.mustprogress to this loop's loop id metadata. void setLoopMustProgress(); + /// Add llvm.loop.from.tapir.loop to this loop's loop id metadata, to indicate + /// that this loop was derived from a Tapir loop. + void setDerivedFromTapirLoop(); + + /// Returns true if the loop was derived from a Tapir loop, according to its + /// metadata. + bool wasDerivedFromTapirLoop() const; + void dump() const; void dumpVerbose() const; diff --git a/llvm/include/llvm/Analysis/LoopIterator.h b/llvm/include/llvm/Analysis/LoopIterator.h index 523d2a21825d0dd..07f3bf7f1137ea8 100644 --- a/llvm/include/llvm/Analysis/LoopIterator.h +++ b/llvm/include/llvm/Analysis/LoopIterator.h @@ -104,6 +104,7 @@ class LoopBlocksDFS { private: Loop *L; + SmallPtrSet TaskExitBlocks; /// Map each block to its postorder number. A block is only mapped after it is /// preorder visited by DFS. It's postorder number is initially zero and set @@ -112,8 +113,14 @@ class LoopBlocksDFS { std::vector PostBlocks; public: - LoopBlocksDFS(Loop *Container) : - L(Container), PostNumbers(NextPowerOf2(Container->getNumBlocks())) { + LoopBlocksDFS(Loop *Container) + : L(Container), PostNumbers(NextPowerOf2(Container->getNumBlocks())) { + PostBlocks.reserve(Container->getNumBlocks()); + } + LoopBlocksDFS(Loop *Container, bool IncludeTaskExits) + : L(Container), PostNumbers(NextPowerOf2(Container->getNumBlocks())) { + if (IncludeTaskExits) + L->getTaskExits(TaskExitBlocks); PostBlocks.reserve(Container->getNumBlocks()); } @@ -123,7 +130,9 @@ class LoopBlocksDFS { void perform(const LoopInfo *LI); /// Return true if postorder numbers are assigned to all loop blocks. - bool isComplete() const { return PostBlocks.size() == L->getNumBlocks(); } + bool isComplete() const { + return PostBlocks.size() == (L->getNumBlocks() + TaskExitBlocks.size()); + } /// Iterate over the cached postorder blocks. POIterator beginPostorder() const { @@ -175,6 +184,8 @@ class LoopBlocksRPO { public: LoopBlocksRPO(Loop *Container) : DFS(Container) {} + LoopBlocksRPO(Loop *Container, bool IncludeTaskExits) + : DFS(Container, IncludeTaskExits) {} /// Traverse the loop blocks and store the DFS result. void perform(const LoopInfo *LI) { @@ -229,7 +240,7 @@ class LoopBlocksTraversal { /// /// TODO: If anyone is interested, we could record preorder numbers here. bool visitPreorder(BasicBlock *BB) { - if (!DFS.L->contains(LI->getLoopFor(BB))) + if (!DFS.L->contains(LI->getLoopFor(BB)) && !DFS.TaskExitBlocks.count(BB)) return false; return DFS.PostNumbers.insert(std::make_pair(BB, 0)).second; diff --git a/llvm/include/llvm/Analysis/MemoryBuiltins.h b/llvm/include/llvm/Analysis/MemoryBuiltins.h index bb282a1b73d320b..f38cee72f8db942 100644 --- a/llvm/include/llvm/Analysis/MemoryBuiltins.h +++ b/llvm/include/llvm/Analysis/MemoryBuiltins.h @@ -118,6 +118,10 @@ std::optional getAllocSize( return V; }); +/// Gets the size arguments for the requested allocation. 
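LoopBlocksDFS and LoopBlocksRPO gain an IncludeTaskExits constructor in the LoopIterator changes above, so traversals can cover task-exit blocks as well. A minimal usage sketch; the visit body is a placeholder.

```cpp
// Sketch: RPO traversal of a loop that also covers task-exit blocks,
// using the IncludeTaskExits constructor added above.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"

static void visitLoopInRPO(llvm::Loop *L, const llvm::LoopInfo *LI) {
  llvm::LoopBlocksRPO RPO(L, /*IncludeTaskExits=*/true);
  RPO.perform(LI);
  for (llvm::BasicBlock *BB : RPO) {
    // Process BB; blocks of spawned tasks now appear in the traversal too.
    (void)BB;
  }
}
```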
+std::pair getAllocSizeArgs(const CallBase *CB, + const TargetLibraryInfo *TLI); + /// If this is a call to an allocation function that initializes memory to a /// fixed value, return said value in the requested type. Otherwise, return /// nullptr. diff --git a/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h b/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h index d5b2eb6253db959..829c832c8ef30d4 100644 --- a/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -32,6 +32,7 @@ class AssumptionCache; class BatchAAResults; class DominatorTree; class PHITransAddr; +class TaskInfo; /// A memory dependence query can return one of three different answers. class MemDepResult { @@ -357,6 +358,7 @@ class MemoryDependenceResults { DominatorTree &DT; PredIteratorCache PredCache; EarliestEscapeInfo EII; + TaskInfo *TI; unsigned DefaultBlockScanLimit; @@ -367,8 +369,9 @@ class MemoryDependenceResults { public: MemoryDependenceResults(AAResults &AA, AssumptionCache &AC, const TargetLibraryInfo &TLI, DominatorTree &DT, - unsigned DefaultBlockScanLimit) - : AA(AA), AC(AC), TLI(TLI), DT(DT), EII(DT), + unsigned DefaultBlockScanLimit, + TaskInfo* TI = nullptr) + : AA(AA), AC(AC), TLI(TLI), DT(DT), EII(DT), TI(TI), DefaultBlockScanLimit(DefaultBlockScanLimit) {} /// Handle invalidation in the new PM. diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index ac828021dd2ae1c..2183467ca9a65f0 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -116,6 +116,7 @@ class LLVMContext; class MemoryAccess; class MemorySSAWalker; class Module; +class TaskInfo; class Use; class Value; class raw_ostream; @@ -700,8 +701,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryPhi, MemoryAccess) /// accesses. class MemorySSA { public: - MemorySSA(Function &, AliasAnalysis *, DominatorTree *); - MemorySSA(Loop &, AliasAnalysis *, DominatorTree *); + MemorySSA(Function &, AliasAnalysis *, DominatorTree *, TaskInfo * = nullptr); + MemorySSA(Loop &, AliasAnalysis *, DominatorTree *, TaskInfo * = nullptr); // MemorySSA must remain where it's constructed; Walkers it creates store // pointers to it. @@ -877,6 +878,7 @@ class MemorySSA { DominatorTree *DT; Function *F = nullptr; Loop *L = nullptr; + TaskInfo *TI = nullptr; // Memory SSA mappings DenseMap ValueToMemoryAccess; @@ -920,7 +922,7 @@ class MemorySSAUtil { // This function should not be used by new passes. static bool defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU, - AliasAnalysis &AA); + AliasAnalysis &AA, TaskInfo *TI = nullptr); }; /// An analysis that produces \c MemorySSA for a function. diff --git a/llvm/include/llvm/Analysis/MustExecute.h b/llvm/include/llvm/Analysis/MustExecute.h index 468d94e7cd68b91..016479359c70d91 100644 --- a/llvm/include/llvm/Analysis/MustExecute.h +++ b/llvm/include/llvm/Analysis/MustExecute.h @@ -42,6 +42,7 @@ class Loop; class LoopInfo; class PostDominatorTree; class raw_ostream; +class TaskInfo; /// Captures loop safety information. /// It keep information for loop blocks may throw exception or otherwise @@ -96,6 +97,7 @@ class LoopSafetyInfo { /// least once (under the assumption that the loop is entered). 
virtual bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, + const TaskInfo *TI, const Loop *CurLoop) const = 0; LoopSafetyInfo() = default; @@ -121,6 +123,7 @@ class SimpleLoopSafetyInfo: public LoopSafetyInfo { bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, + const TaskInfo *TI, const Loop *CurLoop) const override; }; @@ -146,6 +149,7 @@ class ICFLoopSafetyInfo: public LoopSafetyInfo { bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, + const TaskInfo *TI, const Loop *CurLoop) const override; /// Returns true if we could not execute a memory-modifying instruction before diff --git a/llvm/include/llvm/Analysis/SparsePropagation.h b/llvm/include/llvm/Analysis/SparsePropagation.h index d5805a7314757f6..7cee58b38aa848f 100644 --- a/llvm/include/llvm/Analysis/SparsePropagation.h +++ b/llvm/include/llvm/Analysis/SparsePropagation.h @@ -337,6 +337,11 @@ void SparseSolver::getFeasibleSuccessors( return; } + if (isa(TI) || isa(TI) || isa(TI)) { + Succs.assign(Succs.size(), true); + return; + } + SwitchInst &SI = cast(TI); LatticeVal SCValue; if (AggressiveUndef) diff --git a/llvm/include/llvm/Analysis/TapirRaceDetect.h b/llvm/include/llvm/Analysis/TapirRaceDetect.h new file mode 100644 index 000000000000000..9def6b92307125c --- /dev/null +++ b/llvm/include/llvm/Analysis/TapirRaceDetect.h @@ -0,0 +1,365 @@ +//===-- TapirRaceDetect.h - Tapir determinacy-race detection ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// TapirRaceDetect is an LLVM pass that analyses Tapir tasks and dependences +// between memory accesses to find accesses that might race. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_TAPIRRACEDETECT_H +#define LLVM_ANALYSIS_TAPIRRACEDETECT_H + +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" + +namespace llvm { + +class Loop; +class LoopInfo; +class RuntimePointerChecking; +class ScalarEvolution; +class StratABIList; +class TargetLibraryInfo; +class TaskInfo; + +/// RaceInfo +class RaceInfo { +public: + // Possible conditions for a race: + // + // 1) Within the function, two instructions that might execute in parallel + // access aliasing locations, and at least one performs a write. + // + // 2) An instruction reads or writes a location that might alias a global + // variable or function argument. In this case, the race would occur via an + // ancestor of the invocation of this function. + enum RaceType + { + None = 0, + Local = 1, // Possible race via local pointer or control flow. 
+ ViaAncestorRef = 2, // Possible race with ref in caller (e.g., via function + // parameter or global) + ViaAncestorMod = 4, // Possible race with mod inf caller (e.g., via function + // parameter or global) + Opaque = 8, // Possible race via unknown program state (e.g., global data) + }; + + static RaceType setLocalRace(const RaceType RT) { + return RaceType(static_cast(RT) | static_cast(Local)); + } + static RaceType setRaceViaAncestorRef(const RaceType RT) { + return RaceType(static_cast(RT) | + static_cast(ViaAncestorRef)); + } + static RaceType setRaceViaAncestorMod(const RaceType RT) { + return RaceType(static_cast(RT) | + static_cast(ViaAncestorMod)); + } + static RaceType setOpaqueRace(const RaceType RT) { + return RaceType(static_cast(RT) | static_cast(Opaque)); + } + static RaceType clearOpaqueRace(const RaceType RT) { + return RaceType(static_cast(RT) & ~static_cast(Opaque)); + } + static RaceType unionRaceTypes(const RaceType RT1, const RaceType RT2) { + return RaceType(static_cast(RT1) | static_cast(RT2)); + } + + static bool isRace(const RaceType RT) { + return (RaceType::None != RT); + } + static bool isLocalRace(const RaceType RT) { + return (static_cast(RT) & static_cast(RaceType::Local)) == + static_cast(RaceType::Local); + } + static bool isRaceViaAncestor(const RaceType RT) { + return isRaceViaAncestorRef(RT) || isRaceViaAncestorMod(RT); + } + static bool isRaceViaAncestorRef(const RaceType RT) { + return (static_cast(RT) & + static_cast(RaceType::ViaAncestorRef)) == + static_cast(RaceType::ViaAncestorRef); + } + static bool isRaceViaAncestorMod(const RaceType RT) { + return (static_cast(RT) & + static_cast(RaceType::ViaAncestorMod)) == + static_cast(RaceType::ViaAncestorMod); + } + static bool isOpaqueRace(const RaceType RT) { + return (static_cast(RT) & static_cast(RaceType::Opaque)) == + static_cast(RaceType::Opaque); + } + static void printRaceType(RaceInfo::RaceType RT, raw_ostream &OS) { + if (RaceInfo::isLocalRace(RT)) + OS << "Local"; + if (RaceInfo::isRaceViaAncestor(RT)) { + if (RaceInfo::isLocalRace(RT)) + OS << ", "; + OS << "Via Ancestor"; + if (RaceInfo::isRaceViaAncestorMod(RT)) + OS << " Mod"; + if (RaceInfo::isRaceViaAncestorRef(RT)) + OS << " Ref"; + } + if (RaceInfo::isOpaqueRace(RT)) { + if (RaceInfo::isLocalRace(RT) || RaceInfo::isRaceViaAncestor(RT)) + OS << ", "; + OS << "Opaque"; + } + } + + using MemAccessInfo = PointerIntPair; + + // Struct to store data about a race. + struct RaceData { + MemAccessInfo Access = { nullptr, false }; + unsigned OperandNum = static_cast(-1); + RaceType Type = RaceType::None; + GeneralAccess Racer; + + RaceData() = default; + RaceData(MemAccessInfo Access, unsigned OperandNum, const RaceType RT, + GeneralAccess Racer = GeneralAccess()) + : Access(Access), OperandNum(OperandNum), Type(RT), + Racer(Racer) {} + + const Value *getPtr() const { return Access.getPointer(); } + }; + + // Map to store race results. 
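RaceType is a bit-flag enum, so results for multiple accesses can be merged with unionRaceTypes and inspected with the predicates above. A short illustrative helper, not part of the patch:

```cpp
// Sketch: combining and testing race types with the static helpers above.
#include "llvm/Analysis/TapirRaceDetect.h"
#include "llvm/Support/raw_ostream.h"

static void summarizeRace(llvm::RaceInfo::RaceType A,
                          llvm::RaceInfo::RaceType B) {
  using RI = llvm::RaceInfo;
  // Race types are bit flags, so merging two results is a simple union.
  RI::RaceType Combined = RI::unionRaceTypes(A, B);
  if (!RI::isRace(Combined))
    return;
  if (RI::isLocalRace(Combined))
    llvm::errs() << "may race within the function\n";
  if (RI::isRaceViaAncestor(Combined))
    llvm::errs() << "may race through a caller\n";
  RI::printRaceType(Combined, llvm::errs());
  llvm::errs() << "\n";
}
```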
+ struct ResultTy : + public DenseMap> { + + void recordRace(const Instruction *I, MemAccessInfo Access, + unsigned OperandNum, const RaceType RT, + const GeneralAccess &Racer) { + if (!count(I)) { + (*this)[I].push_back(RaceData(Access, OperandNum, RT, Racer)); + return; + } + for (RaceData &RD : (*this)[I]) + if ((RD.Access == Access) && (RD.OperandNum == OperandNum) && + (RD.Racer == Racer)) { + RD.Type = unionRaceTypes(RD.Type, RT); + return; + } + (*this)[I].push_back(RaceData(Access, OperandNum, RT, Racer)); + } + void recordLocalRace(const GeneralAccess &GA, + const GeneralAccess &Racer) { + recordRace(GA.I, MemAccessInfo(GA.getPtr(), GA.isMod()), GA.OperandNum, + RaceType::Local, Racer); + } + void recordRaceViaAncestorRef(const GeneralAccess &GA, + const GeneralAccess &Racer) { + recordRace(GA.I, MemAccessInfo(GA.getPtr(), GA.isMod()), GA.OperandNum, + RaceType::ViaAncestorRef, Racer); + } + void recordRaceViaAncestorMod(const GeneralAccess &GA, + const GeneralAccess &Racer) { + recordRace(GA.I, MemAccessInfo(GA.getPtr(), GA.isMod()), GA.OperandNum, + RaceType::ViaAncestorMod, Racer); + } + void recordOpaqueRace(const GeneralAccess &GA, + const GeneralAccess &Racer) { + recordRace(GA.I, MemAccessInfo(GA.getPtr(), GA.isMod()), GA.OperandNum, + RaceType::Opaque, Racer); + } + + RaceType getRaceType(const Instruction *I, + const SmallPtrSetImpl *Filter = nullptr) const { + if (!count(I)) + return RaceType::None; + RaceType RT = RaceType::None; + + // Union the recorded race types + for (RaceData &RD : lookup(I)) { + if (Filter && RD.Racer.isValid() && Filter->count(RD.Racer.I)) + continue; + RT = unionRaceTypes(RD.Type, RT); + } + return RT; + } + + ModRefInfo getLocalRaceModRef( + const Instruction *I, + const SmallPtrSetImpl *Filter = nullptr) const { + if (!count(I)) + return ModRefInfo::NoModRef; + + ModRefInfo MRI = ModRefInfo::NoModRef; + // Union the recorded local race mod-ref info + for (RaceData &RD : lookup(I)) { + if (RaceType::Local != RD.Type) + continue; + if (Filter && RD.Racer.isValid() && Filter->count(RD.Racer.I)) + continue; + if (!RD.Racer.isValid()) + return ModRefInfo::ModRef; + if (RD.Racer.isMod()) + MRI |= ModRefInfo::Mod; + if (RD.Racer.isRef()) + MRI |= ModRefInfo::Ref; + } + return MRI; + } + }; + using ObjectMRTy = DenseMap; + using PtrChecksTy = + DenseMap>; + using AccessToUnderlyingObjMap = + DenseMap>; + + using FilterTy = const SmallPtrSetImpl; + + RaceInfo(Function *F, DominatorTree &DT, LoopInfo &LI, TaskInfo &TI, + DependenceInfo &DI, ScalarEvolution &SE, + const TargetLibraryInfo *TLI); + + const SmallVectorImpl &getRaceData(const Instruction *I) { + return Result[I]; + } + + RaceType getRaceType(const Instruction *I, FilterTy *Filter = nullptr) const { + return Result.getRaceType(I, Filter); + } + bool mightRace(const Instruction *I, FilterTy *Filter = nullptr) const { + return isRace(getRaceType(I, Filter)); + } + bool mightRaceLocally(const Instruction *I, + FilterTy *Filter = nullptr) const { + return isLocalRace(getRaceType(I, Filter)); + } + bool mightRaceViaAncestor(const Instruction *I, + FilterTy *Filter = nullptr) const { + return isRaceViaAncestor(getRaceType(I, Filter)); + } + bool mightRaceViaAncestorRef(const Instruction *I, + FilterTy *Filter = nullptr) const { + return isRaceViaAncestorRef(getRaceType(I, Filter)); + } + bool mightRaceViaAncestorMod(const Instruction *I, + FilterTy *Filter = nullptr) const { + return isRaceViaAncestorMod(getRaceType(I, Filter)); + } + bool mightRaceOpaquely(const Instruction *I, + FilterTy *Filter 
= nullptr) const { + return isOpaqueRace(getRaceType(I, Filter)); + } + + const ObjectMRTy &getObjectMRForRace() const { + return ObjectMRForRace; + } + bool ObjectInvolvedInRace(const Value *V) const { + return ObjectMRForRace.count(V); + } + ModRefInfo GetObjectMRForRace(const Value *V) const { + if (!ObjectInvolvedInRace(V)) + return ModRefInfo::NoModRef; + return ObjectMRForRace.lookup(V); + } + + RaceType getOverallRaceType() const { + RaceType RT = RaceType::None; + for (auto Res : Result) + for (auto &RD : Res.second) + RT = unionRaceTypes(RT, RD.Type); + return RT; + } + + ModRefInfo getLocalRaceModRef( + const Instruction *I, + const SmallPtrSetImpl *Filter = nullptr) const { + return Result.getLocalRaceModRef(I, Filter); + } + + void getObjectsFor(Instruction *I, SmallPtrSetImpl &Objects); + void getObjectsFor(MemAccessInfo Access, + SmallPtrSetImpl &Objects); + + bool invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &); + + void print(raw_ostream &) const; + + AAResults *getAA() const { return DI.getAA(); } + ScalarEvolution *getSE() const { return &SE; } + +private: + void analyzeFunction(); + + Function *F; + + // Analyses + DominatorTree &DT; + LoopInfo &LI; + TaskInfo &TI; + DependenceInfo &DI; + ScalarEvolution &SE; + const TargetLibraryInfo *TLI; + + ResultTy Result; + // Map from underlying objects to mod/ref behavior necessary for potential + // race. + ObjectMRTy ObjectMRForRace; + PtrChecksTy AllPtrRtChecks; + + AccessToUnderlyingObjMap AccessToObjs; +}; + +// AnalysisPass +class TapirRaceDetect : public AnalysisInfoMixin { +public: + using Result = RaceInfo; + Result run(Function &F, FunctionAnalysisManager &FAM); + +private: + static AnalysisKey Key; + friend struct AnalysisInfoMixin; +}; // class TapirRaceDetect + +// Printer pass +class TapirRaceDetectPrinterPass + : public PassInfoMixin { +public: + TapirRaceDetectPrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + +private: + raw_ostream &OS; +}; // class TapirRaceDetectPrinterPass + +// Legacy pass manager pass +class TapirRaceDetectWrapperPass : public FunctionPass { +public: + static char ID; + + TapirRaceDetectWrapperPass(); + + bool runOnFunction(Function &F) override; + void releaseMemory() override; + void getAnalysisUsage(AnalysisUsage &) const override; + void print(raw_ostream &, const Module * = nullptr) const override; + RaceInfo &getRaceInfo() const; + +private: + std::unique_ptr Info; +}; // class TapirRaceDetectWrapperPass + +// createTapirRaceDetectWrapperPass - This creates an instance of the +// TapirRaceDetect wrapper pass. +FunctionPass *createTapirRaceDetectWrapperPass(); + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Analysis/TapirTargetFuncs.def b/llvm/include/llvm/Analysis/TapirTargetFuncs.def new file mode 100644 index 000000000000000..deea1327485e457 --- /dev/null +++ b/llvm/include/llvm/Analysis/TapirTargetFuncs.def @@ -0,0 +1,23 @@ +//===-- TapirTargetFuncs.def - Library information ----*- C++ -*-----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This .def file will either fill in the enum definition or fill in the +// string representation array definition for TargetLibraryInfo. 
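TapirRaceDetect is exposed as a standard function analysis (with printer and legacy-wrapper variants above), so a pass can pull a RaceInfo result from the analysis manager. The pass below is a hypothetical consumer written against that interface, not something in the patch.

```cpp
// Sketch: a small function pass that consults TapirRaceDetect (names above).
#include "llvm/Analysis/TapirRaceDetect.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"

struct CountMaybeRacingInsts
    : llvm::PassInfoMixin<CountMaybeRacingInsts> {
  llvm::PreservedAnalyses run(llvm::Function &F,
                              llvm::FunctionAnalysisManager &FAM) {
    llvm::RaceInfo &RI = FAM.getResult<llvm::TapirRaceDetect>(F);
    unsigned MaybeRacing = 0;
    for (llvm::BasicBlock &BB : F)
      for (llvm::Instruction &I : BB)
        if (RI.mightRace(&I))
          ++MaybeRacing;
    (void)MaybeRacing; // e.g., report via a statistic or an optimization remark.
    return llvm::PreservedAnalyses::all();
  }
};
```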
+// Which is defined depends on whether TLI_DEFINE_ENUM is defined or +// TLI_DEFINE_STRING is defined. Only one should be defined at a time. + +#define TLI_DEFINE_STRING_INTERNAL(string_repr) string_repr, + +#if defined(TLI_DEFINE_CILK_LIBS) +/// unsigned __cilkrts_get_nworkers(void); +TLI_DEFINE_STRING_INTERNAL("__cilkrts_get_nworkers") +/// unsigned __cilkrts_get_worker_number(void); +TLI_DEFINE_STRING_INTERNAL("__cilkrts_get_worker_number") +#endif + +#undef TLI_DEFINE_STRING_INTERNAL diff --git a/llvm/include/llvm/Analysis/TapirTaskInfo.h b/llvm/include/llvm/Analysis/TapirTaskInfo.h new file mode 100644 index 000000000000000..88066904dce84a2 --- /dev/null +++ b/llvm/include/llvm/Analysis/TapirTaskInfo.h @@ -0,0 +1,1545 @@ +//===- TapirTaskInfo.h - Tapir task calculator ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the TapirTaskInfo class that is used to identify parallel +// tasks as represented in Tapir. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_TAPIRTASKINFO_H +#define LLVM_ANALYSIS_TAPIRTASKINFO_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/Support/Allocator.h" +#include +#include + +namespace llvm { + +class PHINode; +class Loop; +class raw_ostream; +class Spindle; +class Task; +class TaskInfo; + +//===----------------------------------------------------------------------===// +/// In Tapir, the basic blocks in a function can be partitioned into +/// spindles. A spindle is a connected set of basic blocks with a +/// single entry point for parallel control flow. When executed, all +/// blocks within a spindle are guaranteed to execute sequentially on +/// one worker. +/// +class Spindle { +public: + enum SPType { Entry, Detach, Sync, Phi }; + using SpindleEdge = std::pair; + +private: + SPType Ty; + + Task *ParentTask = nullptr; + + // The list of basic blocks in this spindle. The first entry is the entry + // block of the spindle. + std::vector Blocks; + + SmallPtrSet DenseBlockSet; + + // Predecessor and successor spindles. + SmallVector Incoming; + SmallVector Outgoing; + + // If this spindle starts with a taskframe.create, TaskFrameUser points to the + // task that uses that created taskframe. 
+ Task *TaskFrameUser = nullptr; + Spindle *TaskFrameParent = nullptr; + SetVector SubTaskFrames; + SetVector TaskFrameSubtasks; + SetVector TaskFrameSpindles; + + Spindle(const Spindle &) = delete; + const Spindle &operator=(const Spindle &) = delete; + +public: + BasicBlock *getEntry() const { return getBlocks().front(); } + bool isEntry(const BasicBlock *B) const { return (getBlocks().front() == B); } + Task *getParentTask() const { return ParentTask; } + + void setParentTask(Task *T) { ParentTask = T; } + + SPType getType() const { return Ty; } + bool isSync() const { return Sync == Ty; } + bool isPhi() const { return Phi == Ty; } + + Value *getTaskFrameCreate() const; + Task *getTaskFrameUser() const { return TaskFrameUser; } + Spindle *getTaskFrameParent() const { return TaskFrameParent; } + BasicBlock *getTaskFrameContinuation() const; + /// Return the nesting level of this taskframe. + unsigned getTaskFrameDepth() const { + unsigned D = 0; + for (const Spindle *CurTF = TaskFrameParent; CurTF; + CurTF = CurTF->TaskFrameParent) + ++D; + return D; + } + + Task *getTaskFromTaskFrame() const; + + /// Return true if the specified basic block is in this spindle. + bool contains(const BasicBlock *BB) const { + return DenseBlockSet.count(BB); + } + + /// Return true if the specified instruction is in this spindle. + bool contains(const Instruction *Inst) const { + return contains(Inst->getParent()); + } + + /// Returns true if the given spindle \p S is in the set of this spindle's + /// taskframe spindles. Returns false if this is not a taskframe.create + /// spindle or if \p S is not in the set. + bool taskFrameContains(Spindle *S) const { + return TaskFrameSpindles.count(S); + } + + /// Return true if this spindle is a shared EH spindle. + bool isSharedEH() const; + + /// Return true if this spindle is the continuation of a detached task. + bool isTaskContinuation() const; + + /// Return true if the predecessor spindle Pred is part of a different task + /// from this spindle. + bool predInDifferentTask(const Spindle *Pred) const { + return (getParentTask() != Pred->getParentTask()) && !isSharedEH(); + } + /// Return true if the successor spindle Succ is part of the same task as this + /// spindle. + bool succInSameTask(const Spindle *Succ) const; + + /// Return true if the successor spindle Succ is part of the same task as this + /// spindle. + bool succInSubTask(const Spindle *Succ) const; + + /// Get a list of the basic blocks which make up this task. + ArrayRef getBlocks() const { + return Blocks; + } + using iterator = typename ArrayRef::const_iterator; + iterator block_begin() const { return getBlocks().begin(); } + iterator block_end() const { return getBlocks().end(); } + inline iterator_range blocks() const { + return make_range(block_begin(), block_end()); + } + + /// Get the number of blocks in this task in constant time. + unsigned getNumBlocks() const { + return Blocks.size(); + } + + /// Return a direct, mutable handle to the blocks vector so that we can + /// mutate it efficiently with techniques like `std::remove`. + std::vector &getBlocksVector() { + return Blocks; + } + /// Return a direct, mutable handle to the blocks set so that we can + /// mutate it efficiently. + SmallPtrSetImpl &getBlocksSet() { + return DenseBlockSet; + } + + /// True if terminator in the block can branch to another block that is + /// outside of this spindle. 
+ bool isSpindleExiting(const BasicBlock *BB) const { + if (BB->getTerminator()->getNumSuccessors() == 0) + return true; + for (const auto *Succ : children(BB)) + if (!contains(Succ)) + return true; + return false; + } + + /// Helper class for iterator to walk just the exiting basic blocks of the + /// spindle. + class SpindleExitingFilter { + const Spindle *S = nullptr; + public: + SpindleExitingFilter() {} + SpindleExitingFilter(const Spindle *S) : S(S) {} + bool operator()(const BasicBlock *B) const { + return S->isSpindleExiting(B); + } + }; + inline iterator_range< + filter_iterator::iterator, + SpindleExitingFilter>> spindle_exits() { + return make_filter_range(blocks(), SpindleExitingFilter(this)); + } + inline iterator_range< + filter_iterator::const_iterator, + SpindleExitingFilter>> spindle_exits() const { + return make_filter_range(blocks(), SpindleExitingFilter(this)); + } + + // Iterators for the incoming and outgoing edges of this spindle. + using spedge_iterator = typename SmallVectorImpl::iterator; + using spedge_const_iterator = + typename SmallVectorImpl::const_iterator; + using spedge_range = iterator_range; + using spedge_const_range = iterator_range; + + inline spedge_iterator in_begin() { return Incoming.begin(); } + inline spedge_const_iterator in_begin() const { + return Incoming.begin(); + } + inline spedge_iterator in_end() { return Incoming.end(); } + inline spedge_const_iterator in_end() const { + return Incoming.end(); + } + inline spedge_range in_edges() { + return make_range(in_begin(), in_end()); + } + inline spedge_const_range in_edges() const { + return make_range(in_begin(), in_end()); + } + + inline spedge_iterator out_begin() { return Outgoing.begin(); } + inline spedge_const_iterator out_begin() const { + return Outgoing.begin(); + } + inline spedge_iterator out_end() { return Outgoing.end(); } + inline spedge_const_iterator out_end() const { + return Outgoing.end(); + } + inline spedge_range out_edges() { + return make_range(out_begin(), out_end()); + } + inline spedge_const_range out_edges() const { + return make_range(out_begin(), out_end()); + } + + template + class adj_iterator_impl + : public iterator_adaptor_base< + adj_iterator_impl, SPEdgeIt, + typename std::iterator_traits::iterator_category, + SpindleT, std::ptrdiff_t, SpindleT *, SpindleT> { + + using BaseT = iterator_adaptor_base< + adj_iterator_impl, SPEdgeIt, + typename std::iterator_traits::iterator_category, + SpindleT, std::ptrdiff_t, SpindleT *, SpindleT>; + + public: + adj_iterator_impl(SPEdgeIt Begin) : BaseT(Begin) {} + inline SpindleT operator*() const { return BaseT::I->first; } + }; + + using adj_iterator = adj_iterator_impl<>; + using adj_const_iterator = + adj_iterator_impl; + using adj_range = iterator_range; + using adj_const_range = iterator_range; + + using tf_subtask_iterator = typename SetVector::const_iterator; + using tf_subtask_const_iterator = tf_subtask_iterator; + inline tf_subtask_iterator tf_subtask_begin() const { + return TaskFrameSubtasks.begin(); + } + inline tf_subtask_iterator tf_subtask_end() const { + return TaskFrameSubtasks.end(); + } + inline iterator_range taskframe_subtasks() const { + return make_range(tf_subtask_begin(), tf_subtask_end()); + } + + using subtaskframe_iterator = typename SetVector::const_iterator; + using subtaskframe_const_iterator = subtaskframe_iterator; + inline subtaskframe_iterator subtaskframe_begin() const { + return SubTaskFrames.begin(); + } + inline subtaskframe_iterator subtaskframe_end() const { + return 
SubTaskFrames.end(); + } + inline iterator_range subtaskframes() const { + return make_range(subtaskframe_begin(), subtaskframe_end()); + } + + using tf_spindle_iterator = typename SetVector::const_iterator; + using tf_spindle_const_iterator = tf_spindle_iterator; + inline tf_spindle_iterator tf_spindle_begin() const { + return TaskFrameSpindles.begin(); + } + inline tf_spindle_iterator tf_spindle_end() const { + return TaskFrameSpindles.end(); + } + inline iterator_range taskframe_spindles() const { + return make_range(tf_spindle_begin(), tf_spindle_end()); + } + + /// Print spindle with all the BBs inside it. + void print(raw_ostream &OS, bool Verbose = false) const; + + /// Raw method to add block B to this spindle. + void addBlock(BasicBlock &B) { + Blocks.push_back(&B); + DenseBlockSet.insert(&B); + } + + // Returns true if the basic block B predeces this spindle. + bool blockPrecedesSpindle(const BasicBlock *B) const { + for (const BasicBlock *SB : successors(B)) + if (SB == getEntry()) + return true; + return false; + } + + // Raw method to add spindle S as a predecessor of this spindle. + void addSpindleEdgeTo(Spindle *Succ, BasicBlock *FromExit) { + assert(contains(FromExit) && + "Cannot add spindle edge from block not in this spindle"); + assert(Succ->blockPrecedesSpindle(FromExit) && + "FromExit must precede successor spindle"); + Outgoing.push_back(SpindleEdge(Succ, FromExit)); + Succ->Incoming.push_back(SpindleEdge(this, FromExit)); + } + +protected: + friend class Task; + friend class TaskInfo; + + /// This creates an empty spindle. + Spindle() = default; + + explicit Spindle(BasicBlock *BB, SPType Ty) : Ty(Ty) { + Blocks.push_back(BB); + DenseBlockSet.insert(BB); + } + + // To allow passes like SCEV to key analysis results off of `Task` pointers, + // we disallow re-use of pointers within a task pass manager. This means task + // passes should not be `delete` ing `Task` objects directly (and risk a later + // `Task` allocation re-using the address of a previous one) but should be + // using TaskInfo::markAsRemoved, which keeps around the `Task` pointer till + // the end of the lifetime of the `TaskInfo` object. + // + // To make it easier to follow this rule, we mark the destructor as + // non-public. + ~Spindle() { + Blocks.clear(); + DenseBlockSet.clear(); + Incoming.clear(); + Outgoing.clear(); + ParentTask = nullptr; + TaskFrameUser = nullptr; + TaskFrameParent = nullptr; + SubTaskFrames.clear(); + TaskFrameSubtasks.clear(); + TaskFrameSpindles.clear(); + } +}; + +raw_ostream &operator<<(raw_ostream &OS, const Spindle &S); + +// Iterators for the predecessors of a Spindle, using the Spindle edges. 
+using pred_spindle_iterator = typename Spindle::adj_iterator; +using pred_spindle_const_iterator = typename Spindle::adj_const_iterator; +using pred_spindle_range = iterator_range; +using pred_spindle_const_range = iterator_range; + +inline pred_spindle_iterator pred_begin(Spindle *S) { + return pred_spindle_iterator(S->in_begin()); +} +inline pred_spindle_const_iterator pred_begin(const Spindle *S) { + return pred_spindle_const_iterator(S->in_begin()); +} +inline pred_spindle_iterator pred_end(Spindle *S) { + return pred_spindle_iterator(S->in_end()); +} +inline pred_spindle_const_iterator pred_end(const Spindle *S) { + return pred_spindle_const_iterator(S->in_end()); +} +inline pred_spindle_range predecessors(Spindle *S) { + return pred_spindle_range(pred_begin(S), pred_end(S)); +} +inline pred_spindle_const_range predecessors(const Spindle *S) { + return pred_spindle_const_range(pred_begin(S), pred_end(S)); +} + +// Iterators for the successors of a Spindle, using the Spindle edges. +using succ_spindle_iterator = typename Spindle::adj_iterator; +using succ_spindle_const_iterator = typename Spindle::adj_const_iterator; +using succ_spindle_range = iterator_range; +using succ_spindle_const_range = iterator_range; + +inline succ_spindle_iterator succ_begin(Spindle *S) { + return succ_spindle_iterator(S->out_begin()); +} +inline succ_spindle_const_iterator succ_begin(const Spindle *S) { + return succ_spindle_const_iterator(S->out_begin()); +} +inline succ_spindle_iterator succ_end(Spindle *S) { + return succ_spindle_iterator(S->out_end()); +} +inline succ_spindle_const_iterator succ_end(const Spindle *S) { + return succ_spindle_const_iterator(S->out_end()); +} +inline succ_spindle_range successors(Spindle *S) { + return succ_spindle_range(succ_begin(S), succ_end(S)); +} +inline succ_spindle_const_range successors(const Spindle *S) { + return succ_spindle_const_range(succ_begin(S), succ_end(S)); +} + +// Helper class for iterating over spindles within the same task. +class InTaskFilter { + const Spindle *S = nullptr; +public: + InTaskFilter() {} + InTaskFilter(const Spindle *S) : S(S) {} + bool operator()(const Spindle *Succ) const { + return S->succInSameTask(Succ); + } +}; + +//===--------------------------------------------------------------------===// +// GraphTraits specializations for spindle graphs +//===--------------------------------------------------------------------===// + +// Provide specializations of GraphTraits to be able to treat a function +// as a graph of spindles. + +template <> struct GraphTraits { + using NodeRef = Spindle *; + using ChildIteratorType = succ_spindle_iterator; + + static NodeRef getEntryNode(Spindle *S) { return S; } + static ChildIteratorType child_begin(NodeRef N) { return succ_begin(N); } + static ChildIteratorType child_end(NodeRef N) { return succ_end(N); } +}; + +template <> struct GraphTraits { + using NodeRef = const Spindle *; + using ChildIteratorType = succ_spindle_const_iterator; + + static NodeRef getEntryNode(const Spindle *S) { return S; } + static ChildIteratorType child_begin(NodeRef N) { return succ_begin(N); } + static ChildIteratorType child_end(NodeRef N) { return succ_end(N); } +}; + +// Provide specializations of GraphTrais to be able to treat a function as a +// graph of spindles and walk it in inverse order. Inverse order in this case +// is considered to be when traversing the predecessor edges of a spindle +// instead of the successor edges. 
+ +template <> struct GraphTraits> { + using NodeRef = Spindle *; + using ChildIteratorType = pred_spindle_iterator; + + static NodeRef getEntryNode(Inverse G) { return G.Graph; } + static ChildIteratorType child_begin(NodeRef N) { return pred_begin(N); } + static ChildIteratorType child_end(NodeRef N) { return pred_end(N); } +}; + +template <> struct GraphTraits> { + using NodeRef = const Spindle *; + using ChildIteratorType = pred_spindle_const_iterator; + + static NodeRef getEntryNode(Inverse G) { return G.Graph; } + static ChildIteratorType child_begin(NodeRef N) { return pred_begin(N); } + static ChildIteratorType child_end(NodeRef N) { return pred_end(N); } +}; + +// Special type of GraphTrait that uses a filter on the successors of a spindle. +// This GraphTrait is used to build the InTask and UnderTask GraphTraits. + +template +using FilteredSuccessorSpindles = std::pair; + +template +struct GraphTraits> { + using NodeRef = Spindle *; + using ChildIteratorType = filter_iterator; + + static NodeRef getEntryNode(FilteredSuccessorSpindles S) { + return S.first; + } + static ChildIteratorType child_begin(NodeRef N) { + return make_filter_range(successors(N), Filter(N)).begin(); + } + static ChildIteratorType child_end(NodeRef N) { + return make_filter_range(successors(N), Filter(N)).end(); + } +}; + +template +struct GraphTraits> { + using NodeRef = const Spindle *; + using ChildIteratorType = + filter_iterator; + + static NodeRef getEntryNode( + FilteredSuccessorSpindles S) { + return S.first; + } + static ChildIteratorType child_begin(NodeRef N) { + return make_filter_range(successors(N), Filter(N)).begin(); + } + static ChildIteratorType child_end(NodeRef N) { + return make_filter_range(successors(N), Filter(N)).end(); + } +}; + +// Wrapper to allow traversal of only those spindles within a task, excluding +// all subtasks of that task. +template +struct InTask + : public FilteredSuccessorSpindles { + inline InTask(SpindlePtrT S) + : FilteredSuccessorSpindles + (S, InTaskFilter(S)) {} +}; + +template<> struct GraphTraits> : + public GraphTraits> { + using NodeRef = Spindle *; + static NodeRef getEntryNode(InTask G) { + return G.first; + } +}; +template<> struct GraphTraits> : + public GraphTraits> { + using NodeRef = const Spindle *; + static NodeRef getEntryNode(InTask G) { + return G.first; + } +}; + +// Wrapper to traversal of taskframe tree. +template +struct TaskFrames { + const GraphType &Graph; + + inline TaskFrames(const GraphType &G) : Graph(G) {} +}; + +template <> struct GraphTraits> { + using NodeRef = Spindle *; + using ChildIteratorType = Spindle::subtaskframe_iterator; + + static NodeRef getEntryNode(TaskFrames G) { return G.Graph; } + static ChildIteratorType child_begin(NodeRef N) { + return N->subtaskframe_begin(); + } + static ChildIteratorType child_end(NodeRef N) { + return N->subtaskframe_end(); + } +}; + +template <> struct GraphTraits> { + using NodeRef = const Spindle *; + using ChildIteratorType = Spindle::subtaskframe_iterator; + + static NodeRef getEntryNode(TaskFrames G) { return G.Graph; } + static ChildIteratorType child_begin(NodeRef N) { + return N->subtaskframe_begin(); + } + static ChildIteratorType child_end(NodeRef N) { + return N->subtaskframe_end(); + } +}; + +//===----------------------------------------------------------------------===// +/// Instances of this class are used to represent Tapir tasks that are detected +/// in the flow graph. 
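+///
+/// Tasks form a tree: each detached task is a subtask of the task that
+/// performs the detach, and the function body itself is the root task.
+///
+/// A minimal sketch of walking that tree (illustrative; assumes a populated
+/// TaskInfo \c TI):
+/// \code
+///   SmallVector<Task *, 8> Worklist;
+///   Worklist.push_back(TI.getRootTask());
+///   while (!Worklist.empty()) {
+///     Task *T = Worklist.pop_back_val();
+///     for (Task *SubT : T->subtasks())
+///       Worklist.push_back(SubT);
+///   }
+/// \endcode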
+/// +class Task { + Task *ParentTask; + // Dominator tree + DominatorTree &DomTree; + // Tasks contained entirely within this one. + std::vector SubTasks; + + // List of spindles that make up this task. + std::vector Spindles; + SmallPtrSet DenseSpindleSet; + + // List of shared exception-handling spindles associated with this task. + SmallVector SharedSubTaskEH; + SmallPtrSet DenseEHSpindleSet; + + // Pointers to the continuation and exceptional-continuation spindles for this + // task. + Spindle *Continuation = nullptr; + Spindle *EHContinuation = nullptr; + // The exceptional continuation of the task might not be a landingpad, due to + // transformations on exception-handling code. Hence we keep track of the + // value of landingpad at the exceptional continuation. + Value *LPadValueInEHContinuation = nullptr; + + // Spindle that creates the taskframe this task uses. + Spindle *TaskFrameCreateSpindle = nullptr; + + // Set of taskframe.create spindles that are children of this task. + SmallVector TaskFrameCreates; + + // Set of root taskframe.create spindles that are children of this task. + SmallVector TaskFrameRoots; + + Task(const Task &) = delete; + const Task &operator=(const Task &) = delete; + +public: + /// Return the nesting level of this task. An outer-most task has depth 1, + /// for consistency with task depth values used for basic blocks, where depth + /// 0 is used for blocks not inside any tasks. + unsigned getTaskDepth() const { + unsigned D = 0; + for (const Task *CurTask = ParentTask; CurTask; + CurTask = CurTask->ParentTask) + ++D; + return D; + } + Spindle *getEntrySpindle() const { + return getSpindles().front(); + } + BasicBlock *getEntry() const { + return getEntrySpindle()->getEntry(); + } + Task *getParentTask() const { return ParentTask; } + void setParentTask(Task *T) { ParentTask = T; } + + /// Return true if this task is "serial," meaning it does not itself perform a + /// detach. This method does not preclude functions called by this task from + /// performing a detach. + bool isSerial() const { return SubTasks.empty(); } + + /// Return true if this task is a "root" task, meaning that it has no parent task. + bool isRootTask() const { return nullptr == ParentTask; } + + /// Return true if the analysis found child taskframes of this task. This + /// method assumes that taskframes are in canonical form and that + /// findTaskFrameTree() has run. + bool foundChildTaskFrames() const { return !TaskFrameRoots.empty(); } + + /// Return the detach instruction that created this task, or nullptr if this + /// task is a root task. + DetachInst *getDetach() const { + if (isRootTask()) return nullptr; + BasicBlock *Detacher = getEntry()->getSinglePredecessor(); + assert(Detacher && + "Entry block of non-root task should have a single predecessor"); + assert(isa(Detacher->getTerminator()) && + "Single predecessor of a task should be terminated by a detach"); + return dyn_cast(Detacher->getTerminator()); + } + + /// Get the taskframe that this task uses. + Value *getTaskFrameUsed() const { + // Scan the entry block for a taskframe.use intrinsic. If we find one, + // return its argument. + for (const Instruction &I : *getEntry()) + if (const IntrinsicInst *II = dyn_cast(&I)) + if (Intrinsic::taskframe_use == II->getIntrinsicID()) + return II->getArgOperand(0); + return nullptr; + } + + // Get the spindle that creates the taskframe this task uses. 
+  Spindle *getTaskFrameCreateSpindle() const { return TaskFrameCreateSpindle; }
+
+  /// Get the spindle for the continuation of this task. Returns nullptr if
+  /// this task is a root task, meaning it has no continuation spindle.
+  Spindle *getContinuationSpindle() const {
+    assert(((isRootTask() && !Continuation) || (!isRootTask() && Continuation))
+           && "Task should have a continuation spindle iff not a root task.");
+    return Continuation;
+  }
+
+  /// Get the spindle for the exceptional continuation of this task. Returns
+  /// nullptr if this task is a root task or the detach for this task does not
+  /// have an unwind destination.
+  Spindle *getEHContinuationSpindle() const {
+    assert(((isRootTask() && !EHContinuation) ||
+            (!isRootTask() &&
+             ((getDetach()->hasUnwindDest() && EHContinuation) ||
+              (!getDetach()->hasUnwindDest() && !EHContinuation)))) &&
+           "Task should have an EH continuation spindle iff not a root task "
+           "and detach has an unwind destination.");
+    return EHContinuation;
+  }
+
+  /// Get the landingpad value at the exceptional continuation of this task.
+  /// Returns nullptr if this task is a root task or the detach for this task
+  /// does not have an unwind destination.
+  Value *getLPadValueInEHContinuationSpindle() const {
+    assert(((isRootTask() && !LPadValueInEHContinuation) ||
+            (!isRootTask() &&
+             ((getDetach()->hasUnwindDest() && LPadValueInEHContinuation) ||
+              (!getDetach()->hasUnwindDest() &&
+               !LPadValueInEHContinuation)))) &&
+           "Task should have an EH continuation spindle iff not a root task "
+           "and detach has an unwind destination.");
+    return LPadValueInEHContinuation;
+  }
+
+  /// Return true if spindle S is in this task.
+  bool contains(const Spindle *S) const {
+    return DenseSpindleSet.count(S);
+  }
+
+  /// Return true if spindle S is a shared EH spindle dominated by this task.
+  bool containsSharedEH(const Spindle *S) const {
+    return DenseEHSpindleSet.count(S);
+  }
+
+  /// Return true if basic block B is in a shared EH spindle dominated by this
+  /// task.
+  bool containsSharedEH(const BasicBlock *B) const {
+    for (const Spindle *S : SharedSubTaskEH)
+      if (S->contains(B))
+        return true;
+    return false;
+  }
+
+  /// Return the tasks contained entirely within this task.
+ ArrayRef getSubTasks() const { + return SubTasks; + } + std::vector &getSubTasksVector() { + return SubTasks; + } + using iterator = typename std::vector::const_iterator; + using const_iterator = iterator; + using reverse_iterator = + typename std::vector::const_reverse_iterator; + using const_reverse_iterator = reverse_iterator; + inline iterator begin() const { return SubTasks.begin(); } + inline iterator end() const { return SubTasks.end(); } + inline reverse_iterator rbegin() const { return SubTasks.rbegin(); } + inline reverse_iterator rend() const { return SubTasks.rend(); } + inline bool empty() const { return SubTasks.empty(); } + inline iterator_range subtasks() const { + return make_range(begin(), end()); + } + + using tf_iterator = typename SmallVectorImpl::const_iterator; + using tf_const_iterator = tf_iterator; + inline tf_iterator tf_begin() const { return TaskFrameCreates.begin(); } + inline tf_iterator tf_end() const { return TaskFrameCreates.end(); } + inline iterator_range taskframe_creates() const { + return make_range(tf_begin(), tf_end()); + } + inline tf_iterator tf_roots_begin() const { + return TaskFrameRoots.begin(); + } + inline tf_iterator tf_roots_end() const { return TaskFrameRoots.end(); } + inline iterator_range taskframe_roots() const { + return make_range(tf_roots_begin(), tf_roots_end()); + } + + /// Get the number of spindles in this task in constant time. + unsigned getNumSpindles() const { + return Spindles.size(); + } + + /// Return the spindles contained within this task and no subtask. + ArrayRef getSpindles() const { + return Spindles; + } + std::vector &getSpindlesVector() { + return Spindles; + } + SmallPtrSetImpl &getSpindlesSet() { + return DenseSpindleSet; + } + + using spindle_iterator = typename std::vector::const_iterator; + inline spindle_iterator spindle_begin() const { + return Spindles.begin(); + } + inline spindle_iterator spindle_end() const { + return Spindles.end(); + } + inline iterator_range spindles() const { + return make_range(spindle_begin(), spindle_end()); + } + + /// Returns true if this task exits to a shared EH spindle. + bool hasSharedEHExit() const { + if (isRootTask()) return false; + if (!getParentTask()->tracksSharedEHSpindles()) return false; + + for (Spindle *S : getSpindles()) + for (Spindle *Succ : successors(S)) + if (getParentTask()->containsSharedEH(Succ)) + return true; + + return false; + } + + /// Returns true if SharedEH is a shared EH exit of this task. + bool isSharedEHExit(const Spindle *SharedEH) const; + + /// Get the shared EH spindles that this task can exit to and append them to + /// SpindleVec. + void getSharedEHExits(SmallVectorImpl &SpindleVec) const; + + /// Returns true if this task tracks any shared EH spindles for its subtasks. + bool tracksSharedEHSpindles() const { + return !SharedSubTaskEH.empty(); + } + /// Get the number of shared EH spindles in this task in constant time. + unsigned getNumSharedEHSpindles() const { + return SharedSubTaskEH.size(); + } + + /// Return the shared EH spindles contained within this task. + const SmallVectorImpl &getSharedEHSpindles() const { + return SharedSubTaskEH; + } + SmallVectorImpl &getSharedEHSpindles() { + return SharedSubTaskEH; + } + /// Get the shared EH spindle containing basic block B, if it exists. 
+  const Spindle *getSharedEHContaining(const BasicBlock *B) const {
+    for (const Spindle *S : SharedSubTaskEH)
+      if (S->contains(B))
+        return S;
+    return nullptr;
+  }
+  Spindle *getSharedEHContaining(BasicBlock *B) const {
+    for (Spindle *S : SharedSubTaskEH)
+      if (S->contains(B))
+        return S;
+    return nullptr;
+  }
+
+  using shared_eh_spindle_iterator =
+      typename SmallVectorImpl<Spindle *>::const_iterator;
+  shared_eh_spindle_iterator shared_eh_spindle_begin() const {
+    return getSharedEHSpindles().begin();
+  }
+  shared_eh_spindle_iterator shared_eh_spindle_end() const {
+    return getSharedEHSpindles().end();
+  }
+  inline iterator_range<shared_eh_spindle_iterator>
+  shared_eh_spindles() const {
+    return make_range(shared_eh_spindle_begin(), shared_eh_spindle_end());
+  }
+
+  /// Get a list of all basic blocks in this task, including blocks in
+  /// descendant tasks.
+  void getDominatedBlocks(SmallVectorImpl<BasicBlock *> &Blocks) const {
+    DomTree.getDescendants(getEntry(), Blocks);
+  }
+
+  /// Returns true if this task encloses basic block BB simply, that is, without
+  /// checking any shared EH exits of this task.
+  bool simplyEncloses(const BasicBlock *BB) const {
+    return DomTree.dominates(getEntry(), BB);
+  }
+
+  /// Return true if this task encloses basic block BB.
+  bool encloses(const BasicBlock *BB) const {
+    if (simplyEncloses(BB))
+      return true;
+    if (ParentTask && ParentTask->tracksSharedEHSpindles())
+      if (const Spindle *SharedEH = ParentTask->getSharedEHContaining(BB))
+        return isSharedEHExit(SharedEH);
+    return false;
+  }
+
+  /// Returns either the representative subtask of this task that encloses
+  /// basic block BB or this task itself if no subtask encloses BB. This task
+  /// must enclose BB.
+  ///
+  /// These representatives are useful for studying series-parallel
+  /// relationships between basic blocks in a function when those basic blocks
+  /// might appear in nested subtasks.
+  const Task *getSubTaskEnclosing(const BasicBlock *BB) const {
+    assert(encloses(BB) && "Task does not enclose given BasicBlock");
+    for (Task *SubT : subtasks())
+      if (SubT->encloses(BB))
+        return SubT;
+    return this;
+  }
+
+  /// True if terminator in the block can branch to another block that is
+  /// outside of the current task.
+  bool isTaskExiting(const BasicBlock *BB) const {
+    if (BB->getTerminator()->getNumSuccessors() == 0)
+      return true;
+    for (const auto *Succ : children<const BasicBlock *>(BB)) {
+      if (isa<SyncInst>(Succ->getFirstNonPHIOrDbgOrLifetime()))
+        continue;
+      if (!encloses(Succ))
+        return true;
+    }
+    return false;
+  }
+
+  /// True if the spindle can exit to a block that is outside of the current
+  /// task.
+  bool isTaskExiting(const Spindle *S) const {
+    for (const BasicBlock *Exit : S->spindle_exits())
+      if (isTaskExiting(Exit))
+        return true;
+    return false;
+  }
+
+  // Returns true if the specified value is defined in the parent of this task.
+  bool definedInParent(const Value *V) const {
+    if (isa<Argument>(V)) return true;
+    if (const Instruction *I = dyn_cast<Instruction>(V))
+      return !encloses(I->getParent());
+    return false;
+  }
+
+  /// Verify task structure.
+  void verify(const TaskInfo *TI, const BasicBlock *Entry,
+              const DominatorTree &DT) const;
+
+  /// Print task with all the BBs inside it.
+  void print(raw_ostream &OS, unsigned Depth = 0, bool Verbose = false) const;
+
+  void dump() const;
+  void dumpVerbose() const;
+
+  /// Raw method to add spindle S to this task.
+  void addSpindle(Spindle &S) {
+    Spindles.push_back(&S);
+    DenseSpindleSet.insert(&S);
+  }
+
+  /// Raw method to add a shared exception-handling spindle S to this task.
+ void addEHSpindle(Spindle &S) { + SharedSubTaskEH.push_back(&S); + DenseEHSpindleSet.insert(&S); + } + + // Add task ST as a subtask of this task. + void addSubTask(Task *ST) { + assert(!ST->ParentTask && "SubTask already has a parent task."); + ST->setParentTask(this); + SubTasks.push_back(ST); + } + + // Set Spindle S to be the continuation spindle of this task. + void setContinuationSpindle(Spindle *S) { + assert(!isRootTask() && "Root task cannot have a continuation spindle."); + Continuation = S; + } + + // Set S to be the exceptional continuation spindle of this task. + void setEHContinuationSpindle(Spindle *S, Value *LPadVal) { + assert((!isRootTask() || getDetach()->hasUnwindDest()) && + "Task should not have an exceptional continuation."); + EHContinuation = S; + LPadValueInEHContinuation = LPadVal; + } + +protected: + friend class TaskInfo; + + explicit Task(Spindle &Entry, DominatorTree &DomTree) + : ParentTask(nullptr), DomTree(DomTree) { + Spindles.push_back(&Entry); + DenseSpindleSet.insert(&Entry); + } + + // To allow passes like SCEV to key analysis results off of `Task` pointers, + // we disallow re-use of pointers within a task pass manager. This means task + // passes should not be `delete` ing `Task` objects directly (and risk a later + // `Task` allocation re-using the address of a previous one) but should be + // using TaskInfo::markAsRemoved, which keeps around the `Task` pointer till + // the end of the lifetime of the `TaskInfo` object. + // + // To make it easier to follow this rule, we mark the destructor as + // non-public. + ~Task() { + for (auto *SubTask : SubTasks) + SubTask->~Task(); + + for (auto *Spindle : Spindles) + Spindle->~Spindle(); + + for (auto *SharedEH : SharedSubTaskEH) + SharedEH->~Spindle(); + + SubTasks.clear(); + Spindles.clear(); + SharedSubTaskEH.clear(); + DenseSpindleSet.clear(); + DenseEHSpindleSet.clear(); + ParentTask = nullptr; + Continuation = nullptr; + EHContinuation = nullptr; + LPadValueInEHContinuation = nullptr; + TaskFrameCreateSpindle = nullptr; + TaskFrameCreates.clear(); + TaskFrameRoots.clear(); + } +}; + +raw_ostream &operator<<(raw_ostream &OS, const Task &T); + +//===--------------------------------------------------------------------===// +// GraphTraits specializations for task spindle graphs +//===--------------------------------------------------------------------===// + +// Allow clients to walk the list of nested tasks. +template <> struct GraphTraits { + using NodeRef = const Task *; + using ChildIteratorType = Task::const_iterator; + + static NodeRef getEntryNode(const Task *T) { return T; } + static ChildIteratorType child_begin(NodeRef N) { return N->begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->end(); } +}; + +template <> struct GraphTraits { + using NodeRef = Task *; + using ChildIteratorType = Task::iterator; + + static NodeRef getEntryNode(Task *T) { return T; } + static ChildIteratorType child_begin(NodeRef N) { return N->begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->end(); } +}; + +// Filter for spindle successors in the same task or a subtask. +class UnderTaskFilter { + const Spindle *S = nullptr; +public: + UnderTaskFilter() {} + UnderTaskFilter(const Spindle *S) : S(S) {} + bool operator()(const Spindle *Succ) const { + return S->succInSameTask(Succ) || + (Succ->getParentTask()->getParentTask() == S->getParentTask()); + } +}; + +// Wrapper to allow traversal of only those spindles within a task, including +// all subtasks of that task. 
+template +struct UnderTask + : public FilteredSuccessorSpindles { + inline UnderTask(SpindlePtrT S) + : FilteredSuccessorSpindles + (S, UnderTaskFilter(S)) {} +}; + +template<> struct GraphTraits> : + public GraphTraits> { + using NodeRef = Spindle *; + static NodeRef getEntryNode(UnderTask G) { + return G.first; + } +}; +template<> struct GraphTraits> : + public GraphTraits> { + using NodeRef = const Spindle *; + static NodeRef getEntryNode(UnderTask G) { + return G.first; + } +}; + +// Structure to record the synced state of each spindle. +struct IsSyncedState { + enum class SyncInfo { + Unsynced = 0, + Synced = 1, + TaskEntry = 2, + NoUnsync = Synced | TaskEntry, + Incomplete = 4, + }; + + static inline bool isUnsynced(const SyncInfo SyncI) { + return (static_cast(SyncI) & static_cast(SyncInfo::NoUnsync)) == + static_cast(SyncInfo::Unsynced); + } + static inline bool isSynced(const SyncInfo SyncI) { + return !isUnsynced(SyncI); + } + static inline bool isIncomplete(const SyncInfo SyncI) { + return (static_cast(SyncI) & static_cast(SyncInfo::Incomplete)) == + static_cast(SyncInfo::Incomplete); + } + static inline SyncInfo setUnsynced(const SyncInfo SyncI) { + // Once a sync state is set to unsynced, it's complete. + return SyncInfo(static_cast(SyncI) & + static_cast(SyncInfo::Unsynced)); + } + static inline SyncInfo setIncomplete(const SyncInfo SyncI) { + return SyncInfo(static_cast(SyncI) | + static_cast(SyncInfo::Incomplete)); + } + static inline SyncInfo setComplete(const SyncInfo SyncI) { + return SyncInfo(static_cast(SyncI) & + ~static_cast(SyncInfo::Incomplete)); + } + + DenseMap SyncedState; + + bool markDefiningSpindle(const Spindle *S); + bool evaluate(const Spindle *S, unsigned EvalNum); +}; + +using MPTaskListTy = DenseMap>; + +// Structure to record the set of child tasks that might be in parallel with +// this spindle. +struct MaybeParallelTasks { + MPTaskListTy TaskList; + + // TODO: Use a bitvector representation to perform the analysis. + + bool markDefiningSpindle(const Spindle *S); + bool evaluate(const Spindle *S, unsigned EvalNum); +}; + +//===----------------------------------------------------------------------===// +/// This class builds and contains all of the top-level task structures in the +/// specified function. +/// +class TaskInfo { + // BBMap - Mapping of basic blocks to the innermost spindle they occur in + DenseMap BBMap; + // SpindleMap - Mapping of spindles to the innermost task they occur in + DenseMap SpindleMap; + // Pointer to the root task for the function. All tasks detached within this + // function body are descendants of this root task. + Task *RootTask = nullptr; + + // Cache storing maybe-parallel-task state. This cache is initialized lazily + // by calls to the mayHappenInParallel method. + mutable std::unique_ptr MPTasks; + + // Flag to indicate whether the taskframe tree has been computed. + mutable bool ComputedTaskFrameTree = false; + + BumpPtrAllocator TaskAllocator; + + void operator=(const TaskInfo &) = delete; + TaskInfo(const TaskInfo &) = delete; + + // Helper for computing the spindles and subtasks contained in all taskframes. 
+ void findTaskFrameTreeHelper(Spindle *TFSpindle, + SmallVectorImpl &ParentWorkList, + SmallPtrSetImpl &SubTFVisited); + +public: + TaskInfo() = default; + ~TaskInfo() { releaseMemory(); } + + TaskInfo(TaskInfo &&Arg) + : BBMap(std::move(Arg.BBMap)), + SpindleMap(std::move(Arg.SpindleMap)), + RootTask(std::move(Arg.RootTask)), + MPTasks(std::move(Arg.MPTasks)), + TaskAllocator(std::move(Arg.TaskAllocator)) { + Arg.RootTask = nullptr; + } + TaskInfo &operator=(TaskInfo &&RHS) { + BBMap = std::move(RHS.BBMap); + SpindleMap = std::move(RHS.SpindleMap); + if (RootTask) + RootTask->~Task(); + RootTask = std::move(RHS.RootTask); + MPTasks = std::move(RHS.MPTasks); + TaskAllocator = std::move(RHS.TaskAllocator); + RHS.RootTask = nullptr; + return *this; + } + + void releaseMemory() { + for (auto BBToSpindle : BBMap) + if (!BBToSpindle.getSecond()->getParentTask()) + BBToSpindle.getSecond()->~Spindle(); + for (auto SpindleToTask : SpindleMap) + if (RootTask != SpindleToTask.getSecond() && + !SpindleToTask.getSecond()->getParentTask()) + SpindleToTask.getSecond()->~Task(); + + BBMap.clear(); + SpindleMap.clear(); + if (RootTask) + RootTask->~Task(); + RootTask = nullptr; + if (MPTasks) { + MPTasks->TaskList.clear(); + MPTasks.reset(); + } + ComputedTaskFrameTree = false; + TaskAllocator.Reset(); + } + + template Spindle *AllocateSpindle(ArgsTy &&... Args) { + Spindle *Storage = TaskAllocator.Allocate(); + return new (Storage) Spindle(std::forward(Args)...); + } + template Task *AllocateTask(ArgsTy &&... Args) { + Task *Storage = TaskAllocator.Allocate(); + return new (Storage) Task(std::forward(Args)...); + } + + Task *getRootTask() const { return RootTask; } + + /// Return true if this function is "serial," meaning it does not itself + /// perform a detach. This method does not preclude functions called by this + /// function from performing a detach. + bool isSerial() const { + assert(getRootTask() && "Null root task\n"); + return getRootTask()->isSerial(); + } + + /// Return true if the analysis found child taskframes of this task. + bool foundChildTaskFrames() const { + assert(getRootTask() && "Null root task\n"); + return getRootTask()->foundChildTaskFrames(); + } + + /// iterator/begin/end - The interface to the top-level tasks in the current + /// function. + /// + using iterator = typename Task::iterator; + using const_iterator = typename Task::const_iterator; + using reverse_iterator = typename Task::reverse_iterator; + using const_reverse_iterator = typename Task::const_reverse_iterator; + inline iterator begin() const { return getRootTask()->begin(); } + inline iterator end() const { return getRootTask()->end(); } + inline reverse_iterator rbegin() const { return getRootTask()->rbegin(); } + inline reverse_iterator rend() const { return getRootTask()->rend(); } + inline bool empty() const { return getRootTask()->empty(); } + + /// Return the innermost spindle that BB lives in. + Spindle *getSpindleFor(const BasicBlock *BB) const { + return BBMap.lookup(BB); + } + + /// Return the innermost task that spindle F lives in. + Task *getTaskFor(const Spindle *S) const { return SpindleMap.lookup(S); } + /// Same as getTaskFor(S). + const Task *operator[](const Spindle *S) const { return getTaskFor(S); } + + /// Return the innermost task that BB lives in. + Task *getTaskFor(const BasicBlock *BB) const { + return getTaskFor(getSpindleFor(BB)); + } + /// Same as getTaskFor(BB). 
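+  ///
+  /// Example (illustrative; \c TI is a TaskInfo computed for the enclosing
+  /// function):
+  /// \code
+  ///   Task *T = TI.getTaskFor(BB);
+  ///   unsigned Depth = T->getTaskDepth();
+  /// \endcode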
+  const Task *operator[](const BasicBlock *BB) const { return getTaskFor(BB); }
+
+  /// Return the taskframe spindle for the given task T.
+  Spindle *getTaskFrameSpindleFor(const Task *T) const {
+    Instruction *TaskFrame =
+        dyn_cast_or_null<Instruction>(T->getTaskFrameUsed());
+    if (!TaskFrame)
+      return nullptr;
+    return getSpindleFor(TaskFrame->getParent());
+  }
+
+  /// Return the innermost task that encompasses both basic blocks BB1 and BB2.
+  Task *getEnclosingTask(const BasicBlock *BB1, const BasicBlock *BB2) const {
+    return getTaskFor(
+        getRootTask()->DomTree.findNearestCommonDominator(BB1, BB2));
+  }
+
+  /// Return the innermost task that encompasses both spindles S1 and S2.
+  Task *getEnclosingTask(const Spindle *S1, const Spindle *S2) const {
+    return getEnclosingTask(S1->getEntry(), S2->getEntry());
+  }
+
+  /// Return true if task T1 encloses task T2.
+  bool encloses(const Task *T1, const Task *T2) const {
+    if (!T1 || !T2) return false;
+    return getRootTask()->DomTree.dominates(T1->getEntry(), T2->getEntry());
+  }
+
+  /// Return true if task T encloses basic block BB.
+  bool encloses(const Task *T, const BasicBlock *BB) const {
+    if (!T) return false;
+    return T->encloses(BB);
+  }
+
+  /// Return true if the task T encloses instruction Inst.
+  bool encloses(const Task *T, const Instruction *Inst) const {
+    return encloses(T, Inst->getParent());
+  }
+
+  /// Return the task nesting level of basic block BB. A depth of 0 means the
+  /// block is in the root task.
+  unsigned getTaskDepth(const BasicBlock *BB) const {
+    return getTaskFor(BB)->getTaskDepth();
+  }
+
+  /// True if basic block BB is a task entry block.
+  bool isTaskEntry(const BasicBlock *BB) const {
+    return getTaskFor(BB)->getEntry() == BB;
+  }
+
+  /// Traverse the graph of spindles to evaluate some parallel state.
+  template <typename StateT>
+  void evaluateParallelState(StateT &State) const {
+    SetVector<Spindle *> ToProcess;
+
+    // This method performs the work-list algorithm for data-flow analysis on
+    // spindles.
+
+    // First mark all defining spindles and spindles whose state is eagerly
+    // updated.
+    {
+      // Get the spindles in post order, so we can traverse them in RPO.
+      SmallVector<Spindle *, 8> POSpindles;
+      for (Spindle *S : post_order(getRootTask()->getEntrySpindle()))
+        POSpindles.push_back(S);
+      // SetVector DefSpindles;
+      for (Spindle *S : llvm::reverse(POSpindles))
+        // If we find a defining spindle (or a spindle with an eagerly-updated
+        // state), add its successors for processing.
+        if (State.markDefiningSpindle(S))
+          for (Spindle *Succ : successors(S))
+            ToProcess.insert(Succ);
+    }
+
+    // Perform the work-list algorithm to propagate data-flow information among
+    // the spindles.
+    {
+      SmallVector<Spindle *, 8> NextToProcess;
+      unsigned EvalNum = 0;
+      while (!ToProcess.empty()) {
+        // Process all spindles that need processing.
+        for (Spindle *Curr : ToProcess)
+          if (!State.evaluate(Curr, EvalNum))
+            // If the state of this spindle changed, add its successors for
+            // future processing.
+            for (Spindle *Succ : successors(Curr))
+              NextToProcess.push_back(Succ);
+
+        // Get ready to process the next set of spindles.
+        ToProcess.clear();
+        ToProcess.insert(NextToProcess.begin(), NextToProcess.end());
+        NextToProcess.clear();
+        ++EvalNum;
+      }
+    }
+  }
+
+  /// Check if an alloca AI is promotable based on task structure.
+  bool isAllocaParallelPromotable(const AllocaInst *AI) const;
+
+  /// Check if the two basic blocks B1 and B2 may execute in parallel.
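+  ///
+  /// A minimal sketch (illustrative; \c TI is the TaskInfo for the function
+  /// containing \c I1 and \c I2, and reportPossibleRace is a hypothetical
+  /// helper):
+  /// \code
+  ///   if (TI.mayHappenInParallel(I1->getParent(), I2->getParent()))
+  ///     reportPossibleRace(I1, I2);
+  /// \endcode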
+ bool mayHappenInParallel(const BasicBlock *B1, const BasicBlock *B2) const { + // Common case: No blocks execute in parallel in a serial function. + if (isSerial()) + return false; + + // if (getTaskFor(B1) == getTaskFor(B2)) + // return false; + + // If necessary, compute which tasks may execute in parallel. + if (!MPTasks) { + MPTasks.reset(new MaybeParallelTasks()); + evaluateParallelState(*MPTasks); + } + + // Get the task Encl that encloses both basic blocks. + const Task *Encl = getEnclosingTask(B1, B2); + + // For each basic block, get the representative subtask of Encl that + // encloses that basic block. + const Task *B1Task = Encl->getSubTaskEnclosing(B1); + const Task *B2Task = Encl->getSubTaskEnclosing(B2); + + // Translate these representative tasks into spindles. + const Spindle *B1Spindle = getSpindleFor(B1); + const Spindle *B2Spindle = getSpindleFor(B2); + if (B1Task != Encl) + B1Spindle = getSpindleFor(B1Task->getDetach()->getParent()); + if (B2Task != Encl) + B2Spindle = getSpindleFor(B2Task->getDetach()->getParent()); + + // Evaluate the maybe-parallel task lists for the two representative + // spindles to determine if the blocks may execute in parallel. + return MPTasks->TaskList[B1Spindle].count(B2Task) || + MPTasks->TaskList[B2Spindle].count(B1Task); + } + + /// Create the task forest using a stable algorithm. + void analyze(Function &F, DominatorTree &DomTree); + + /// Compute the spindles and subtasks contained in all taskframes. + void findTaskFrameTree(); + + /// Handle invalidation explicitly. + bool invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &); + + // Debugging + void print(raw_ostream &OS) const; + + void verify(const DominatorTree &DomTree) const; + + /// Destroy a task that has been removed from the `TaskInfo` nest. + /// + /// This runs the destructor of the task object making it invalid to + /// reference afterward. The memory is retained so that the *pointer* to the + /// task remains valid. + /// + /// The caller is responsible for removing this task from the task nest and + /// otherwise disconnecting it from the broader `TaskInfo` data structures. + /// Callers that don't naturally handle this themselves should probably call + /// `erase' instead. + void destroy(Task *T) { + assert(T && "Cannot destroy a null task."); + T->~Task(); + + // Since TaskAllocator is a BumpPtrAllocator, this Deallocate only poisons + // \c T, but the pointer remains valid for non-dereferencing uses. + TaskAllocator.Deallocate(T); + } + + // Manually recalculate TaskInfo from the given dominator tree. + void recalculate(Function &F, DominatorTree &DomTree) { + releaseMemory(); + analyze(F, DomTree); + } + + // Create a spindle with entry block B and type Ty. + Spindle *createSpindleWithEntry(BasicBlock *B, Spindle::SPType Ty) { + Spindle *S = AllocateSpindle(B, Ty); + assert(!BBMap.count(B) && "BasicBlock already in a spindle!"); + BBMap[B] = S; + return S; + } + + // Create a task with spindle entry S. + Task *createTaskWithEntry(Spindle *S, DominatorTree &DomTree) { + Task *T = AllocateTask(*S, DomTree); + S->setParentTask(T); + assert(!SpindleMap.count(S) && "Spindle already in a task!"); + SpindleMap[S] = T; + return T; + } + + // Add spindle S to task T. 
+ void addSpindleToTask(Spindle *S, Task *T) { + assert(!SpindleMap.count(S) && "Spindle already mapped to a task."); + T->addSpindle(*S); + S->setParentTask(T); + SpindleMap[S] = T; + } + + // Add spindle S to task T, where S is a shared exception-handling spindle + // among subtasks of T. + void addEHSpindleToTask(Spindle *S, Task *T) { + assert(!SpindleMap.count(S) && "Spindle already mapped to a task."); + T->addEHSpindle(*S); + S->setParentTask(T); + SpindleMap[S] = T; + } + + // Add basic block B to spindle S. + void addBlockToSpindle(BasicBlock &B, Spindle *S) { + assert(!BBMap.count(&B) && "Block already mapped to a spindle."); + S->addBlock(B); + BBMap[&B] = S; + } + + // Associate a task T with the spindle TFSpindle that creates its taskframe. + void AssociateTaskFrameWithUser(Task *T, Spindle *TFSpindle) { + TFSpindle->TaskFrameUser = T; + T->TaskFrameCreateSpindle = TFSpindle; + } +}; + +/// Enable verification of Tapir task info. +/// +/// The flag enables checks which are expensive and are disabled by default +/// unless the `EXPENSIVE_CHECKS` macro is defined. The `-verify-task-info` +/// flag allows the checks to be enabled selectively without re-compilation. +extern bool VerifyTaskInfo; + +/// Analysis pass that exposes the \c TaskInfo for a function. +class TaskAnalysis : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + using Result = TaskInfo; + + TaskInfo run(Function &F, FunctionAnalysisManager &AM); +}; + +/// Printer pass for the \c TaskAnalysis results. +class TaskPrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit TaskPrinterPass(raw_ostream &OS) : OS(OS) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// Verifier pass for the \c TaskAnalysis results. +struct TaskVerifierPass : public PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// The legacy pass manager's analysis pass to compute task information. +class TaskInfoWrapperPass : public FunctionPass { + TaskInfo TI; + +public: + static char ID; // Pass identification, replacement for typeid + + TaskInfoWrapperPass(); + + TaskInfo &getTaskInfo() { return TI; } + const TaskInfo &getTaskInfo() const { return TI; } + + /// Calculate the natural task information for a given function. + bool runOnFunction(Function &F) override; + + void verifyAnalysis() const override; + + void releaseMemory() override { TI.releaseMemory(); } + + void print(raw_ostream &O, const Module *M = nullptr) const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +/// Function to print a task's contents as LLVM's text IR assembly. +void printTask(Task &T, raw_ostream &OS, const std::string &Banner = ""); + +/// Examine a given loop to determine if it is structurally a Tapir loop. +/// Returns the Task that encodes the loop body if so, or nullptr if not. 
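+///
+/// For example (illustrative; \c LI and \c TI are the LoopInfo and TaskInfo
+/// computed for the current function):
+/// \code
+///   for (Loop *L : LI)
+///     if (Task *T = getTaskIfTapirLoopStructure(L, &TI))
+///       dbgs() << "Tapir loop with body task entering "
+///              << T->getEntry()->getName() << "\n";
+/// \endcode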
+Task *getTaskIfTapirLoopStructure(const Loop *L, TaskInfo *TI); + +} // End llvm namespace + +#endif diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index db5e80ccdbaaba3..627f1940f0baffb 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -15,6 +15,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" #include namespace llvm { @@ -88,6 +89,8 @@ class TargetLibraryInfoImpl { static StringLiteral const StandardNames[NumLibFuncs]; bool ShouldExtI32Param, ShouldExtI32Return, ShouldSignExtI32Param, ShouldSignExtI32Return; unsigned SizeOfInt; + TapirTargetID TapirTarget = TapirTargetID::Last_TapirTargetID; + std::unique_ptr TTOptions = nullptr; enum AvailabilityState { StandardName = 3, // (memset to all ones) @@ -108,6 +111,9 @@ class TargetLibraryInfoImpl { /// on VectorFnName rather than ScalarFnName. std::vector ScalarDescs; + /// Tapir target standard functions + std::vector TapirTargetFuncs; + /// Return true if the function type FTy is valid for the library function /// F, regardless of whether the function is available. bool isValidProtoForLibFunc(const FunctionType &FTy, LibFunc F, @@ -271,6 +277,47 @@ class TargetLibraryInfoImpl { /// conventions. static bool isCallingConvCCompatible(CallBase *CI); static bool isCallingConvCCompatible(Function *Callee); + + /// Set the target for Tapir lowering. + void setTapirTarget(TapirTargetID TargetID) { + TapirTarget = TargetID; + } + + /// Return the ID of the target for Tapir lowering. + TapirTargetID getTapirTarget() const { + return TapirTarget; + } + + /// Return true if we have a nontrivial target for Tapir lowering. + bool hasTapirTarget() const { + return (TapirTarget != TapirTargetID::Last_TapirTargetID) && + (TapirTarget != TapirTargetID::None); + } + + /// Set options for Tapir lowering. + void setTapirTargetOptions(std::unique_ptr Options) { + std::swap(TTOptions, Options); + } + + /// Return any options for Tapir lowering. + TapirTargetOptions *getTapirTargetOptions() const { + return TTOptions.get(); + } + + /// Records known library functions associated with the specified Tapir + /// target. + void addTapirTargetLibraryFunctions() { + addTapirTargetLibraryFunctions(TapirTarget); + } + void addTapirTargetLibraryFunctions(TapirTargetID TargetID); + + /// Searches for a particular function name among known Tapir-target library + /// functions, also checking that its type is valid for the library function + /// matching that name. + /// + /// Return true if it is one of the known tapir-target library functions. 
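+  ///
+  /// Usage sketch (illustrative; assumes a TargetLibraryInfoImpl \c TLII and
+  /// a TapirTargetID \c TargetID chosen elsewhere):
+  /// \code
+  ///   TLII.setTapirTarget(TargetID);
+  ///   TLII.addTapirTargetLibraryFunctions();
+  ///   bool Known = TLII.isTapirTargetLibFunc(*F); // F : const Function *
+  /// \endcode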
+  bool isTapirTargetLibFunc(StringRef funcName) const;
+  bool isTapirTargetLibFunc(const Function &FDecl) const;
 };
 
 /// Provides information about what library functions are available for
@@ -563,6 +610,29 @@ class TargetLibraryInfo {
     return Impl->getIntSize();
   }
 
+  /// \copydoc TargetLibraryInfoImpl::getTapirTarget()
+  TapirTargetID getTapirTarget() const {
+    return Impl->getTapirTarget();
+  }
+
+  /// \copydoc TargetLibraryInfoImpl::hasTapirTarget()
+  bool hasTapirTarget() const {
+    return Impl->hasTapirTarget();
+  }
+
+  /// \copydoc TargetLibraryInfoImpl::getTapirTargetOptions()
+  TapirTargetOptions *getTapirTargetOptions() const {
+    return Impl->getTapirTargetOptions();
+  }
+
+  /// \copydoc TargetLibraryInfoImpl::isTapirTargetLibFunc()
+  bool isTapirTargetLibFunc(StringRef funcName) const {
+    return Impl->isTapirTargetLibFunc(funcName);
+  }
+  bool isTapirTargetLibFunc(const Function &FDecl) const {
+    return Impl->isTapirTargetLibFunc(FDecl);
+  }
+
   /// Handle invalidation from the pass manager.
   ///
   /// If we try to invalidate this info, just return false. It cannot become
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 2411b2b31d2936b..f94146d2ed82a89 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -642,6 +642,29 @@ class TargetTransformInfo {
   TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const;
 
+  /// Parameters that control the generic loop stripmining transformation.
+  struct StripMiningPreferences {
+    /// A forced stripmining factor (the number of iterations of the original
+    /// loop in the stripmined inner-loop body). When set to 0, the stripmining
+    /// transformation will select a stripmining factor based on the current
+    /// cost threshold and other factors.
+    unsigned Count;
+    /// Allow emitting expensive instructions (such as divisions) when computing
+    /// the trip count of a loop for runtime unrolling.
+    bool AllowExpensiveTripCount;
+    /// Default factor for coarsening a task to amortize the cost of creating
+    /// it.
+    unsigned DefaultCoarseningFactor;
+    /// Allow unrolling of all the iterations of the runtime loop remainder.
+    bool UnrollRemainder;
+  };
+
+  /// Get target-customized preferences for the generic Tapir loop stripmining
+  /// transformation. The caller will initialize SMP with the current
+  /// target-independent defaults.
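+  ///
+  /// Illustrative sketch of a target-specific override (MyTTIImpl and the
+  /// chosen values are hypothetical):
+  /// \code
+  ///   void MyTTIImpl::getStripMiningPreferences(
+  ///       Loop *L, ScalarEvolution &SE,
+  ///       TTI::StripMiningPreferences &SMP) const {
+  ///     SMP.DefaultCoarseningFactor = 2048; // amortize detach overhead
+  ///     SMP.AllowExpensiveTripCount = false;
+  ///   }
+  /// \endcode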
+ void getStripMiningPreferences(Loop *L, ScalarEvolution &, + StripMiningPreferences &SMP) const; + // Parameters that control the loop peeling transformation struct PeelingPreferences { /// A forced peeling factor (the number of bodied of the original loop @@ -1870,6 +1893,8 @@ class TargetTransformInfo::Concept { virtual bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) = 0; virtual TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0; + virtual void getStripMiningPreferences(Loop *L, ScalarEvolution &, + StripMiningPreferences &SMP) = 0; virtual std::optional instCombineIntrinsic( InstCombiner &IC, IntrinsicInst &II) = 0; virtual std::optional simplifyDemandedUseBitsIntrinsic( @@ -2328,6 +2353,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) override { return Impl.getPreferredTailFoldingStyle(IVUpdateMayOverflow); } + void getStripMiningPreferences(Loop *L, ScalarEvolution &SE, + StripMiningPreferences &SMP) override { + return Impl.getStripMiningPreferences(L, SE, SMP); + } std::optional instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) override { return Impl.instCombineIntrinsic(IC, II); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 00efa474a91b57e..a4de9ab49949575 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -220,6 +220,9 @@ class TargetTransformInfoImplBase { void getPeelingPreferences(Loop *, ScalarEvolution &, TTI::PeelingPreferences &) const {} + void getStripMiningPreferences(Loop *, ScalarEvolution &, + TTI::StripMiningPreferences &) const {} + bool isLegalAddImmediate(int64_t Imm) const { return false; } bool isLegalAddScalableImmediate(int64_t Imm) const { return false; } @@ -769,6 +772,12 @@ class TargetTransformInfoImplBase { case Intrinsic::threadlocal_address: case Intrinsic::experimental_widenable_condition: case Intrinsic::ssa_copy: + case Intrinsic::syncregion_start: + case Intrinsic::taskframe_create: + case Intrinsic::taskframe_use: + case Intrinsic::taskframe_end: + case Intrinsic::taskframe_load_guard: + case Intrinsic::sync_unwind: // These intrinsics don't actually represent code after lowering. return 0; } @@ -1468,6 +1477,11 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { Type *DstTy = Operands[0]->getType(); return TargetTTI->getVectorInstrCost(*EEI, DstTy, CostKind, Idx); } + case Instruction::Detach: + // Ideally, we'd determine the number of arguments of the detached task. + // But because that computation is expensive, we settle for 30x the basic + // cost of a function call. + return 30 * TTI::TCC_Basic; } // By default, just classify everything as 'basic' or -1 to represent that diff --git a/llvm/include/llvm/Analysis/WorkSpanAnalysis.h b/llvm/include/llvm/Analysis/WorkSpanAnalysis.h new file mode 100644 index 000000000000000..3763917bf4bad2a --- /dev/null +++ b/llvm/include/llvm/Analysis/WorkSpanAnalysis.h @@ -0,0 +1,57 @@ +//===- WorkSpanAnalysis.h - Analysis to estimate work and span --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements an analysis pass to estimate the work and span of the +// program. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_WORKSPANANALYSIS_H_ +#define LLVM_ANALYSIS_WORKSPANANALYSIS_H_ + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Support/InstructionCost.h" + +// TODO: Build a CGSCC pass based on these analyses to efficiently estimate the +// work and span of all the functions in a module. + +// TODO: Use BlockFrequencyInfo to improve how this analysis evaluates code with +// control flow. Specifically, the analysis should weight the work and span of +// a block based on the probabilities of its incoming edges, with special care +// given to detach, reattach, and continue edges. + +// TODO: Connect these analyses with a scalability profiler to implement PGO for +// Tapir. + +namespace llvm { +class Loop; +class LoopInfo; +class ScalarEvolution; +class TargetLibraryInfo; +class TargetTransformInfo; + +struct WSCost { + InstructionCost Work = 0; + InstructionCost Span = 0; + + bool UnknownCost = false; + + CodeMetrics Metrics; +}; + +// Get a constant trip count for the given loop. +unsigned getConstTripCount(const Loop *L, ScalarEvolution &SE); + +void estimateLoopCost(WSCost &LoopCost, const Loop *L, LoopInfo *LI, + ScalarEvolution *SE, const TargetTransformInfo &TTI, + TargetLibraryInfo *TLI, + const SmallPtrSetImpl &EphValues); +} + +#endif // LLVM_ANALYSIS_WORKSPANANALYSIS_H_ diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h index e381295802009a0..897bff3ada57279 100644 --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -640,6 +640,9 @@ namespace llvm { bool parseCatchPad(Instruction *&Inst, PerFunctionState &PFS); bool parseCleanupPad(Instruction *&Inst, PerFunctionState &PFS); bool parseCallBr(Instruction *&Inst, PerFunctionState &PFS); + bool parseDetach(Instruction *&Inst, PerFunctionState &PFS); + bool parseReattach(Instruction *&Inst, PerFunctionState &PFS); + bool parseSync(Instruction *&Inst, PerFunctionState &PFS); bool parseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, bool IsFP); diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index db6780b70ca5aac..7c295535d394bbb 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -351,6 +351,12 @@ enum Kind { kw_freeze, + // Tapir types + kw_detach, + kw_reattach, + // NOTE: kw_sync is already defined for a different context. + // kw_tsync, + // Metadata types. 
kw_distinct, diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index fb88f2fe75adb51..40ae85cf16919e1 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -654,6 +654,10 @@ enum FunctionCodes { FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE = 64, // [DILocation, DILocalVariable, DIExpression, Value] FUNC_CODE_DEBUG_RECORD_LABEL = 65, // [DILocation, DILabel] + + FUNC_CODE_INST_DETACH = 66, // DETACH: [bb#,bb#] or [bb#,bb#,bb#] + FUNC_CODE_INST_REATTACH = 67, // REATTACH: [bb#] + FUNC_CODE_INST_SYNC = 68, // SYNC: [bb#] }; enum UseListCodes { @@ -758,6 +762,8 @@ enum AttributeKindCodes { ATTR_KIND_SANITIZE_NUMERICAL_STABILITY = 93, ATTR_KIND_INITIALIZES = 94, ATTR_KIND_HYBRID_PATCHABLE = 95, + ATTR_KIND_SANITIZE_CILK = 96, + ATTR_KIND_STEALABLE = 97, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index deae2c55d26e274..1d7ce162d81bdec 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -436,6 +436,12 @@ class IRTranslator : public MachineFunctionPass { bool translateIndirectBr(const User &U, MachineIRBuilder &MIRBuilder); + bool translateDetach(const User &U, MachineIRBuilder &MIRBuilder); + + bool translateReattach(const User &U, MachineIRBuilder &MIRBuilder); + + bool translateSync(const User &U, MachineIRBuilder &MIRBuilder); + bool translateExtractValue(const User &U, MachineIRBuilder &MIRBuilder); bool translateInsertValue(const User &U, MachineIRBuilder &MIRBuilder); diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 304db57eca49942..ea74c2363c3b4b2 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -721,6 +721,7 @@ struct MachineFunction { StringRef Name; MaybeAlign Alignment = std::nullopt; bool ExposesReturnsTwice = false; + bool ExposesOpaqueReturnsTwice = false; // GISel MachineFunctionProperties. bool Legalized = false; bool RegBankSelected = false; @@ -763,6 +764,7 @@ template <> struct MappingTraits { YamlIO.mapRequired("name", MF.Name); YamlIO.mapOptional("alignment", MF.Alignment, std::nullopt); YamlIO.mapOptional("exposesReturnsTwice", MF.ExposesReturnsTwice, false); + YamlIO.mapOptional("exposesOpaqueReturnsTwice", MF.ExposesOpaqueReturnsTwice, false); YamlIO.mapOptional("legalized", MF.Legalized, false); YamlIO.mapOptional("regBankSelected", MF.RegBankSelected, false); YamlIO.mapOptional("selected", MF.Selected, false); diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h index 213b7ec6b3fbfbb..1766de48c0d354b 100644 --- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h +++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h @@ -355,6 +355,7 @@ class MachineFrameInfo { /// selection is complete to determine if the stack frame for this function /// contains any variable sized objects. bool hasVarSizedObjects() const { return HasVarSizedObjects; } + void setHasVarSizedObjects(bool v = true) { HasVarSizedObjects = v; } /// Return the index for the stack protector object. 
   int getStackProtectorIndex() const { return StackProtectorIdx; }
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index 6e7292abeddbbdf..7d8928f15d159e3 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -326,6 +326,13 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   /// about the control flow of such functions.
   bool ExposesReturnsTwice = false;
 
+  /// ExposesOpaqueReturnsTwice - True if the function calls setjmp or related
+  /// functions with attribute "returns twice", other than LLVM's builtin
+  /// setjmp, but doesn't have the attribute itself.
+  /// This is used to limit optimizations which cannot reason
+  /// about the control flow of such functions.
+  bool ExposesOpaqueReturnsTwice = false;
+
   /// True if the function includes any inline assembly.
   bool HasInlineAsm = false;
 
@@ -786,6 +793,19 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
     ExposesReturnsTwice = B;
   }
 
+  /// exposesOpaqueReturnsTwice - Returns true if the function calls a
+  /// function with attribute "returns twice" other than LLVM's builtin setjmp
+  /// without having the attribute itself.
+  bool exposesOpaqueReturnsTwice() const {
+    return ExposesOpaqueReturnsTwice;
+  }
+
+  /// setExposesOpaqueReturnsTwice - Set a flag that indicates if there's a call
+  /// to a "returns twice" function other than LLVM's builtin setjmp.
+  void setExposesOpaqueReturnsTwice(bool B) {
+    ExposesOpaqueReturnsTwice = B;
+  }
+
   /// Returns true if the function contains any inline assembly.
   bool hasInlineAsm() const {
     return HasInlineAsm;
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index cafb9781698a2e3..e84a5af32143a39 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -579,6 +579,11 @@ namespace llvm {
   /// Creates MIR Check Debug pass. \see MachineCheckDebugify.cpp
   ModulePass *createCheckDebugMachineModulePass();
 
+  /// Clean up any remaining Tapir instructions. Typically, this pass should
+  /// have no effect, because Tapir instructions should have been lowered
+  /// already to a particular parallel runtime.
+  FunctionPass *createTapirCleanupPass();
+
   /// The pass fixups statepoint machine instruction to replace usage of
   /// caller saved registers with stack slots.
   extern char &FixupStatepointCallerSavedID;
diff --git a/llvm/include/llvm/CodeGen/TailDuplicator.h b/llvm/include/llvm/CodeGen/TailDuplicator.h
index 8b1f67c416c2242..646dc83d2a832f6 100644
--- a/llvm/include/llvm/CodeGen/TailDuplicator.h
+++ b/llvm/include/llvm/CodeGen/TailDuplicator.h
@@ -75,7 +75,8 @@ class TailDuplicator {
 
   bool tailDuplicateBlocks();
   static bool isSimpleBB(MachineBasicBlock *TailBB);
-  bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB);
+  BlockDesc getBlockDesc(MachineBasicBlock *TailBB);
+  bool shouldTailDuplicate(BlockDesc const &Desc, MachineBasicBlock &TailBB);
 
   /// Returns true if TailBB can successfully be duplicated into PredBB
   bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB);
@@ -88,7 +89,7 @@ class TailDuplicator {
   /// deleted.
   /// If \p CandidatePtr is not null, duplicate into these blocks only.
bool tailDuplicateAndUpdate( - bool IsSimple, MachineBasicBlock *MBB, + const BlockDesc &Desc, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl *DuplicatedPreds = nullptr, function_ref *RemovalCallback = nullptr, @@ -115,7 +116,7 @@ class TailDuplicator { bool duplicateSimpleBB(MachineBasicBlock *TailBB, SmallVectorImpl &TDBBs, const DenseSet &RegsUsedByPhi); - bool tailDuplicate(bool IsSimple, + bool tailDuplicate(const BlockDesc &Desc, MachineBasicBlock *TailBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 649711d8faf650a..61f098d870278cc 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -104,6 +104,21 @@ struct ExtAddrMode { ExtAddrMode() = default; }; +struct BlockBRNZ { + // If true, the registers below are dead + bool IsKill = false; + // The register (or set of registers feeding into a PHI) that + // is tested against zero to determine the branch. + SmallVector Regs; + MachineBasicBlock *Zero = nullptr; // Target if register is zero + MachineBasicBlock *Nonzero = nullptr; // Target if register is not zero +}; + +struct BlockDesc { + bool IsSimple = false; + std::optional BRNZ; +}; + //--------------------------------------------------------------------------- /// /// TargetInstrInfo - Interface to description of machine instruction set @@ -705,6 +720,13 @@ class TargetInstrInfo : public MCInstrInfo { llvm_unreachable("Target didn't implement TargetInstrInfo::removeBranch!"); } + /// Remove the branches at the end of the block and any compare + /// instructions used only by the branches. + virtual unsigned removeBranchAndFlags(MachineBasicBlock &MBB, + int *BytesRemoved = nullptr) const { + return removeBranch(MBB, BytesRemoved); + } + /// Insert branch code into the end of the specified MachineBasicBlock. The /// operands to this method are the same as those returned by analyzeBranch. /// This is only invoked in cases where analyzeBranch returns success. It @@ -1726,6 +1748,19 @@ class TargetInstrInfo : public MCInstrInfo { } virtual bool optimizeCondBranch(MachineInstr &MI) const { return false; } + /// Return a descriptor if this block branches depending on whether a register + /// is nonzero. + virtual std::optional isZeroTest(MachineBasicBlock &MBB) const { + return std::optional(); + } + + /// If this instruction sets a register to a constant integer value, + /// return true, the register, and the value. + virtual bool isSetConstant(const MachineInstr &MI, Register &Reg, + int64_t &Value) const { + return false; + } + /// Try to remove the load by folding it to a register operand at the use. /// We fold the load instructions if and only if the /// def and use are in the same BB. 
We only look at one load and see diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake index 629977cc11d6836..03df2abd3863f66 100644 --- a/llvm/include/llvm/Config/llvm-config.h.cmake +++ b/llvm/include/llvm/Config/llvm-config.h.cmake @@ -153,6 +153,15 @@ /* LLVM version string */ #define LLVM_VERSION_STRING "${PACKAGE_VERSION}" +/* Major version of the Tapir API */ +#define TAPIR_VERSION_MAJOR ${TAPIR_VERSION_MAJOR} + +/* Minor version of the Tapir API */ +#define TAPIR_VERSION_MINOR ${TAPIR_VERSION_MINOR} + +/* Patch version of the Tapir API */ +#define TAPIR_VERSION_PATCH ${TAPIR_VERSION_PATCH} + /* Whether LLVM records statistics for use with GetStatistics(), * PrintStatistics() or PrintStatisticsJSON() */ diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index e1bd193891c1e1d..bea25b9ed4191f5 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -306,6 +306,14 @@ def SanitizeNumericalStability : EnumAttr<"sanitize_numerical_stability", [FnAtt def SpeculativeLoadHardening : EnumAttr<"speculative_load_hardening", [FnAttr]>; +/// CilkSanitizer is on. +def SanitizeCilk : EnumAttr<"sanitize_cilk", [FnAttr]>; + +/// From the Cilk perspective, a continuation in the function can be +/// stolen. This attribute is used ensure correct code generation for +/// such functions. +def Stealable : EnumAttr<"stealable", [FnAttr]>; + /// Argument is swift error. def SwiftError : EnumAttr<"swifterror", [ParamAttr]>; @@ -414,6 +422,7 @@ def : MergeRule<"setOR">; def : MergeRule<"setOR">; def : MergeRule<"setOR">; def : MergeRule<"setOR">; +def : MergeRule<"setOR">; def : MergeRule<"adjustCallerSSPLevel">; def : MergeRule<"adjustCallerStackProbes">; def : MergeRule<"adjustCallerStackProbeSize">; diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h index 12571d957da6093..e7a06a1124bc0df 100644 --- a/llvm/include/llvm/IR/BasicBlock.h +++ b/llvm/include/llvm/IR/BasicBlock.h @@ -306,6 +306,16 @@ class BasicBlock final : public Value, // Basic blocks are data objects also SkipPseudoOp)); } + /// Returns a pointer to the first instruction in this block that is not a + /// PHINode, a debug intrinsic, or a sync.unwind intrinsic. + const Instruction * + getFirstNonPHIOrDbgOrSyncUnwind(bool SkipPseudoOp = false) const; + Instruction *getFirstNonPHIOrDbgOrSyncUnwind(bool SkipPseudoOp = false) { + return const_cast( + static_cast(this)->getFirstNonPHIOrDbgOrSyncUnwind( + SkipPseudoOp)); + } + /// Returns an iterator to the first instruction in this block that is /// suitable for inserting a non-PHI instruction. /// diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 01f76d493278083..7997d0c451e468d 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -260,6 +260,10 @@ class StructType : public Type { /// Create an empty structure type. static StructType *get(LLVMContext &Context, bool isPacked = false); + /// Try to lookup a structure type by name, and create one if one does not + /// exist. + static StructType *lookupOrCreate(LLVMContext &Context, StringRef Name); + /// This static method is a convenience method for creating structure types by /// specifying the elements as arguments. Note that this method always returns /// a non-packed struct, and requires at least one element type. 
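A hedged sketch of querying the new function attributes from C++; the `Attribute::SanitizeCilk` and `Attribute::Stealable` enumerators are assumed to follow LLVM's usual Attributes.td naming convention.

```cpp
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"

using namespace llvm;

void exampleAttributeQueries(Function &F) {
  // Instrumentation can key off sanitize_cilk the same way the other
  // sanitizer attributes are used.
  if (F.hasFnAttribute(Attribute::SanitizeCilk)) {
    // ... emit CilkSanitizer hooks for F ...
  }

  // Mark a function whose continuations may be stolen, so code generation
  // stays correct in the presence of work stealing.
  if (!F.hasFnAttribute(Attribute::Stealable))
    F.addFnAttr(Attribute::Stealable);
}
```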
diff --git a/llvm/include/llvm/IR/EHPersonalities.h b/llvm/include/llvm/IR/EHPersonalities.h index c70f832de40b409..e2c0a08d9ee514c 100644 --- a/llvm/include/llvm/IR/EHPersonalities.h +++ b/llvm/include/llvm/IR/EHPersonalities.h @@ -34,6 +34,7 @@ enum class EHPersonality { Wasm_CXX, XL_CXX, ZOS_CXX, + Cilk_CXX, }; /// See if the given exception handling personality function is one diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 31a1fef32199506..793df611d213e19 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -1269,6 +1269,37 @@ class IRBuilderBase { return Insert(new UnreachableInst(Context)); } + /// \brief Create a detach instruction, + /// 'detach within SyncRegion, Detached, Continue'. + DetachInst *CreateDetach(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, MDNode *BranchWeights = nullptr) { + return Insert(addBranchMetadata( + DetachInst::Create(Detached, Continue, SyncRegion), + BranchWeights, nullptr)); + } + + /// \brief Create a detach instruction, + /// 'detach within SyncRegion, Detached, Continue, Unwind'. + DetachInst *CreateDetach(BasicBlock *Detached, BasicBlock *Continue, + BasicBlock *Unwind, Value *SyncRegion, + MDNode *BranchWeights = nullptr) { + return Insert(addBranchMetadata( + DetachInst::Create(Detached, Continue, Unwind, + SyncRegion), + BranchWeights, nullptr)); + } + + /// \brief Create a reattach instruction, 'reattach within SyncRegion, + /// DetachContinue'. + ReattachInst *CreateReattach(BasicBlock *DetachContinue, Value *SyncRegion) { + return Insert(ReattachInst::Create(DetachContinue, SyncRegion)); + } + + /// \brief Create a sync instruction, 'sync within SyncRegion, Continue'. + SyncInst *CreateSync(BasicBlock *Continue, Value *SyncRegion) { + return Insert(SyncInst::Create(Continue, SyncRegion)); + } + //===--------------------------------------------------------------------===// // Instruction creation methods: Binary Operators //===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h index 311e0ac47ddfadd..16be5682661ea68 100644 --- a/llvm/include/llvm/IR/InstVisitor.h +++ b/llvm/include/llvm/IR/InstVisitor.h @@ -250,6 +250,15 @@ class InstVisitor { RetTy visitCatchSwitchInst(CatchSwitchInst &I) { return static_cast(this)->visitTerminator(I); } + RetTy visitDetachInst(DetachInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitReattachInst(ReattachInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitSyncInst(SyncInst &I) { + return static_cast(this)->visitTerminator(I); + } RetTy visitTerminator(Instruction &I) { DELEGATE(Instruction);} // Next level propagators: If the user does not overload a specific diff --git a/llvm/include/llvm/IR/Instruction.def b/llvm/include/llvm/IR/Instruction.def index a5ad92f58f94e3c..1a594807c4b2297 100644 --- a/llvm/include/llvm/IR/Instruction.def +++ b/llvm/include/llvm/IR/Instruction.def @@ -135,90 +135,93 @@ HANDLE_TERM_INST ( 8, CleanupRet , CleanupReturnInst) HANDLE_TERM_INST ( 9, CatchRet , CatchReturnInst) HANDLE_TERM_INST (10, CatchSwitch , CatchSwitchInst) HANDLE_TERM_INST (11, CallBr , CallBrInst) // A call-site terminator - LAST_TERM_INST (11) +HANDLE_TERM_INST (12, Detach , DetachInst) +HANDLE_TERM_INST (13, Reattach , ReattachInst) +HANDLE_TERM_INST (14, Sync , SyncInst) + LAST_TERM_INST (14) // Standard unary operators... 
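A minimal sketch, assuming an otherwise empty function, of how the new IRBuilder helpers fit together with the `llvm.syncregion.start` intrinsic declared later in this patch; block names are illustrative only.

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Emit:  detach -> (spawned work) -> reattach -> continue -> sync -> ret
void emitSpawnSkeleton(Function &F) {
  LLVMContext &Ctx = F.getContext();
  Module *M = F.getParent();

  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", &F);
  BasicBlock *Detached = BasicBlock::Create(Ctx, "det.achd", &F);
  BasicBlock *Continue = BasicBlock::Create(Ctx, "det.cont", &F);
  BasicBlock *SyncCont = BasicBlock::Create(Ctx, "sync.cont", &F);

  IRBuilder<> B(Entry);
  // Every detach/reattach/sync names a sync region token.
  Value *SR = B.CreateCall(
      Intrinsic::getDeclaration(M, Intrinsic::syncregion_start));
  B.CreateDetach(Detached, Continue, SR); // detach within %SR, det.achd, det.cont

  B.SetInsertPoint(Detached);
  // ... the spawned task's work goes here ...
  B.CreateReattach(Continue, SR);         // reattach within %SR, det.cont

  B.SetInsertPoint(Continue);
  B.CreateSync(SyncCont, SR);             // sync within %SR, sync.cont

  B.SetInsertPoint(SyncCont);
  B.CreateRetVoid();
}
```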
- FIRST_UNARY_INST(12) -HANDLE_UNARY_INST(12, FNeg , UnaryOperator) - LAST_UNARY_INST(12) + FIRST_UNARY_INST(15) +HANDLE_UNARY_INST(15, FNeg , UnaryOperator) + LAST_UNARY_INST(15) // Standard binary operators... - FIRST_BINARY_INST(13) -HANDLE_BINARY_INST(13, Add , BinaryOperator) -HANDLE_BINARY_INST(14, FAdd , BinaryOperator) -HANDLE_BINARY_INST(15, Sub , BinaryOperator) -HANDLE_BINARY_INST(16, FSub , BinaryOperator) -HANDLE_BINARY_INST(17, Mul , BinaryOperator) -HANDLE_BINARY_INST(18, FMul , BinaryOperator) -HANDLE_BINARY_INST(19, UDiv , BinaryOperator) -HANDLE_BINARY_INST(20, SDiv , BinaryOperator) -HANDLE_BINARY_INST(21, FDiv , BinaryOperator) -HANDLE_BINARY_INST(22, URem , BinaryOperator) -HANDLE_BINARY_INST(23, SRem , BinaryOperator) -HANDLE_BINARY_INST(24, FRem , BinaryOperator) + FIRST_BINARY_INST(16) +HANDLE_BINARY_INST(16, Add , BinaryOperator) +HANDLE_BINARY_INST(17, FAdd , BinaryOperator) +HANDLE_BINARY_INST(18, Sub , BinaryOperator) +HANDLE_BINARY_INST(19, FSub , BinaryOperator) +HANDLE_BINARY_INST(20, Mul , BinaryOperator) +HANDLE_BINARY_INST(21, FMul , BinaryOperator) +HANDLE_BINARY_INST(22, UDiv , BinaryOperator) +HANDLE_BINARY_INST(23, SDiv , BinaryOperator) +HANDLE_BINARY_INST(24, FDiv , BinaryOperator) +HANDLE_BINARY_INST(25, URem , BinaryOperator) +HANDLE_BINARY_INST(26, SRem , BinaryOperator) +HANDLE_BINARY_INST(27, FRem , BinaryOperator) // Logical operators (integer operands) -HANDLE_BINARY_INST(25, Shl , BinaryOperator) // Shift left (logical) -HANDLE_BINARY_INST(26, LShr , BinaryOperator) // Shift right (logical) -HANDLE_BINARY_INST(27, AShr , BinaryOperator) // Shift right (arithmetic) -HANDLE_BINARY_INST(28, And , BinaryOperator) -HANDLE_BINARY_INST(29, Or , BinaryOperator) -HANDLE_BINARY_INST(30, Xor , BinaryOperator) - LAST_BINARY_INST(30) +HANDLE_BINARY_INST(28, Shl , BinaryOperator) // Shift left (logical) +HANDLE_BINARY_INST(29, LShr , BinaryOperator) // Shift right (logical) +HANDLE_BINARY_INST(30, AShr , BinaryOperator) // Shift right (arithmetic) +HANDLE_BINARY_INST(31, And , BinaryOperator) +HANDLE_BINARY_INST(32, Or , BinaryOperator) +HANDLE_BINARY_INST(33, Xor , BinaryOperator) + LAST_BINARY_INST(33) // Memory operators... - FIRST_MEMORY_INST(31) -HANDLE_MEMORY_INST(31, Alloca, AllocaInst) // Stack management -HANDLE_MEMORY_INST(32, Load , LoadInst ) // Memory manipulation instrs -HANDLE_MEMORY_INST(33, Store , StoreInst ) -HANDLE_MEMORY_INST(34, GetElementPtr, GetElementPtrInst) -HANDLE_MEMORY_INST(35, Fence , FenceInst ) -HANDLE_MEMORY_INST(36, AtomicCmpXchg , AtomicCmpXchgInst ) -HANDLE_MEMORY_INST(37, AtomicRMW , AtomicRMWInst ) - LAST_MEMORY_INST(37) + FIRST_MEMORY_INST(34) +HANDLE_MEMORY_INST(34, Alloca, AllocaInst) // Stack management +HANDLE_MEMORY_INST(35, Load , LoadInst ) // Memory manipulation instrs +HANDLE_MEMORY_INST(36, Store , StoreInst ) +HANDLE_MEMORY_INST(37, GetElementPtr, GetElementPtrInst) +HANDLE_MEMORY_INST(38, Fence , FenceInst ) +HANDLE_MEMORY_INST(39, AtomicCmpXchg , AtomicCmpXchgInst ) +HANDLE_MEMORY_INST(40, AtomicRMW , AtomicRMWInst ) + LAST_MEMORY_INST(40) // Cast operators ... // NOTE: The order matters here because CastInst::isEliminableCastPair // NOTE: (see Instructions.cpp) encodes a table based on this ordering. 
- FIRST_CAST_INST(38) -HANDLE_CAST_INST(38, Trunc , TruncInst ) // Truncate integers -HANDLE_CAST_INST(39, ZExt , ZExtInst ) // Zero extend integers -HANDLE_CAST_INST(40, SExt , SExtInst ) // Sign extend integers -HANDLE_CAST_INST(41, FPToUI , FPToUIInst ) // floating point -> UInt -HANDLE_CAST_INST(42, FPToSI , FPToSIInst ) // floating point -> SInt -HANDLE_CAST_INST(43, UIToFP , UIToFPInst ) // UInt -> floating point -HANDLE_CAST_INST(44, SIToFP , SIToFPInst ) // SInt -> floating point -HANDLE_CAST_INST(45, FPTrunc , FPTruncInst ) // Truncate floating point -HANDLE_CAST_INST(46, FPExt , FPExtInst ) // Extend floating point -HANDLE_CAST_INST(47, PtrToInt, PtrToIntInst) // Pointer -> Integer -HANDLE_CAST_INST(48, IntToPtr, IntToPtrInst) // Integer -> Pointer -HANDLE_CAST_INST(49, BitCast , BitCastInst ) // Type cast -HANDLE_CAST_INST(50, AddrSpaceCast, AddrSpaceCastInst) // addrspace cast - LAST_CAST_INST(50) - - FIRST_FUNCLETPAD_INST(51) -HANDLE_FUNCLETPAD_INST(51, CleanupPad, CleanupPadInst) -HANDLE_FUNCLETPAD_INST(52, CatchPad , CatchPadInst) - LAST_FUNCLETPAD_INST(52) + FIRST_CAST_INST(41) +HANDLE_CAST_INST(41, Trunc , TruncInst ) // Truncate integers +HANDLE_CAST_INST(42, ZExt , ZExtInst ) // Zero extend integers +HANDLE_CAST_INST(43, SExt , SExtInst ) // Sign extend integers +HANDLE_CAST_INST(44, FPToUI , FPToUIInst ) // floating point -> UInt +HANDLE_CAST_INST(45, FPToSI , FPToSIInst ) // floating point -> SInt +HANDLE_CAST_INST(46, UIToFP , UIToFPInst ) // UInt -> floating point +HANDLE_CAST_INST(47, SIToFP , SIToFPInst ) // SInt -> floating point +HANDLE_CAST_INST(48, FPTrunc , FPTruncInst ) // Truncate floating point +HANDLE_CAST_INST(49, FPExt , FPExtInst ) // Extend floating point +HANDLE_CAST_INST(50, PtrToInt, PtrToIntInst) // Pointer -> Integer +HANDLE_CAST_INST(51, IntToPtr, IntToPtrInst) // Integer -> Pointer +HANDLE_CAST_INST(52, BitCast , BitCastInst ) // Type cast +HANDLE_CAST_INST(53, AddrSpaceCast, AddrSpaceCastInst) // addrspace cast + LAST_CAST_INST(53) + + FIRST_FUNCLETPAD_INST(54) +HANDLE_FUNCLETPAD_INST(54, CleanupPad, CleanupPadInst) +HANDLE_FUNCLETPAD_INST(55, CatchPad , CatchPadInst) + LAST_FUNCLETPAD_INST(55) // Other operators... - FIRST_OTHER_INST(53) -HANDLE_OTHER_INST(53, ICmp , ICmpInst ) // Integer comparison instruction -HANDLE_OTHER_INST(54, FCmp , FCmpInst ) // Floating point comparison instr. -HANDLE_OTHER_INST(55, PHI , PHINode ) // PHI node instruction -HANDLE_OTHER_INST(56, Call , CallInst ) // Call a function -HANDLE_OTHER_INST(57, Select , SelectInst ) // select instruction -HANDLE_USER_INST (58, UserOp1, Instruction) // May be used internally in a pass -HANDLE_USER_INST (59, UserOp2, Instruction) // Internal to passes only -HANDLE_OTHER_INST(60, VAArg , VAArgInst ) // vaarg instruction -HANDLE_OTHER_INST(61, ExtractElement, ExtractElementInst)// extract from vector -HANDLE_OTHER_INST(62, InsertElement, InsertElementInst) // insert into vector -HANDLE_OTHER_INST(63, ShuffleVector, ShuffleVectorInst) // shuffle two vectors. -HANDLE_OTHER_INST(64, ExtractValue, ExtractValueInst)// extract from aggregate -HANDLE_OTHER_INST(65, InsertValue, InsertValueInst) // insert into aggregate -HANDLE_OTHER_INST(66, LandingPad, LandingPadInst) // Landing pad instruction. -HANDLE_OTHER_INST(67, Freeze, FreezeInst) // Freeze instruction. - LAST_OTHER_INST(67) + FIRST_OTHER_INST(56) +HANDLE_OTHER_INST(56, ICmp , ICmpInst ) // Integer comparison instruction +HANDLE_OTHER_INST(57, FCmp , FCmpInst ) // Floating point comparison instr. 
+HANDLE_OTHER_INST(58, PHI , PHINode ) // PHI node instruction +HANDLE_OTHER_INST(59, Call , CallInst ) // Call a function +HANDLE_OTHER_INST(60, Select , SelectInst ) // select instruction +HANDLE_USER_INST (61, UserOp1, Instruction) // May be used internally in a pass +HANDLE_USER_INST (62, UserOp2, Instruction) // Internal to passes only +HANDLE_OTHER_INST(63, VAArg , VAArgInst ) // vaarg instruction +HANDLE_OTHER_INST(64, ExtractElement, ExtractElementInst)// extract from vector +HANDLE_OTHER_INST(65, InsertElement, InsertElementInst) // insert into vector +HANDLE_OTHER_INST(66, ShuffleVector, ShuffleVectorInst) // shuffle two vectors. +HANDLE_OTHER_INST(67, ExtractValue, ExtractValueInst)// extract from aggregate +HANDLE_OTHER_INST(68, InsertValue, InsertValueInst) // insert into aggregate +HANDLE_OTHER_INST(69, LandingPad, LandingPadInst) // Landing pad instruction. +HANDLE_OTHER_INST(70, Freeze, FreezeInst) // Freeze instruction. + LAST_OTHER_INST(70) #undef FIRST_TERM_INST #undef HANDLE_TERM_INST diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index c27572300d50630..dbcbd42e5ac3ec4 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -787,6 +787,7 @@ class Instruction : public User, // This list should be kept in sync with the list in mayWriteToMemory for // all opcodes which don't have a memory location. case Instruction::Fence: + case Instruction::Sync: // Like Instruction::Fence case Instruction::CatchPad: case Instruction::CatchRet: case Instruction::Call: @@ -844,6 +845,9 @@ class Instruction : public User, /// Return true if the instruction is a DbgInfoIntrinsic or PseudoProbeInst. bool isDebugOrPseudoInst() const LLVM_READONLY; + /// Return true if the instruction is a llvm.taskframe marker. + bool isTaskFrameMarker() const; + /// Return a pointer to the next non-debug instruction in the same basic /// block as 'this', or nullptr if no such instruction exists. Skip any pseudo /// operations if \c SkipPseudoOp is true. 
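Building on the `visitDetachInst`/`visitReattachInst`/`visitSyncInst` hooks added to InstVisitor above, a simple analysis can tally the Tapir terminators in a function. This is a hedged sketch, not an existing pass.

```cpp
#include "llvm/IR/Function.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

struct TapirTerminatorCounter : InstVisitor<TapirTerminatorCounter> {
  unsigned NumDetach = 0, NumReattach = 0, NumSync = 0;

  void visitDetachInst(DetachInst &DI) { ++NumDetach; }
  void visitReattachInst(ReattachInst &RI) { ++NumReattach; }
  void visitSyncInst(SyncInst &SI) { ++NumSync; }
};

// Usage: a function with no Tapir terminators has nothing to lower.
bool containsTapir(Function &F) {
  TapirTerminatorCounter C;
  C.visit(F);
  return C.NumDetach + C.NumReattach + C.NumSync != 0;
}
```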
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index c07fee58e4bdb46..f9c9beb4478e9f2 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -4385,6 +4385,277 @@ class UnreachableInst : public Instruction { } }; +//===----------------------------------------------------------------------===// +// DetachInst Class +//===----------------------------------------------------------------------===// + +//===--------------------------------------------------------------------------- +/// DetachInst - Detach instruction +/// +class DetachInst : public Instruction { + using UnwindDestField = BoolBitfieldElementT<0>; + + /// Ops list - The operands are ordered: + /// SyncRegion, Detached, Continue[, Unwind] + DetachInst(const DetachInst &DI); + void AssertOK(); + // DetachInst constructors (where {D, C, U} are blocks and SR is a token): + // DetachInst(BB *D, BB *C, Value *SR) - 'detach SR, D, C' + // DetachInst(BB *D, BB *C, Value *SR, Inst *I) + // - 'detach SR, D, C', insert before I + // DetachInst(BB *D, BB *C, Value *SR, BB *I) + // - 'detach SR, D, C', insert at end + DetachInst(BasicBlock *Detached, BasicBlock *Continue, Value *SyncRegion, + Instruction *InsertBefore = nullptr); + DetachInst(BasicBlock *Detached, BasicBlock *Continue, Value *SyncRegion, + BasicBlock *InsertAtEnd); + // DetachInst(BB *D, BB *C, BB *U, Value *SR) - 'detach SR, D, C, U' + // DetachInst(BB *D, BB *C, BB *U, Value *SR, Inst *I) + // - 'detach SR, D, C, U', insert before I + // DetachInst(BB *D, BB *C, BB *U, Value *SR, BB *I) + // - 'detach SR, D, C, U', insert at end + DetachInst(BasicBlock *Detached, BasicBlock *Continue, BasicBlock *Unwind, + Value *SyncRegion, Instruction *InsertBefore = nullptr); + DetachInst(BasicBlock *Detached, BasicBlock *Continue, BasicBlock *Unwind, + Value *SyncRegion, BasicBlock *InsertAtEnd); + +protected: + // Note: Instruction needs to be a friend here to call cloneImpl. 
+ friend class Instruction; + DetachInst *cloneImpl() const; + +public: + static DetachInst *Create(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + Instruction *InsertBefore = nullptr) { + return new(3) DetachInst(Detached, Continue, SyncRegion, InsertBefore); + } + static DetachInst *Create(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, BasicBlock *InsertAtEnd) { + return new(3) DetachInst(Detached, Continue, SyncRegion, InsertAtEnd); + } + static DetachInst *Create(BasicBlock *Detached, BasicBlock *Continue, + BasicBlock *Unwind, Value *SyncRegion, + Instruction *InsertBefore = nullptr) { + return new(4) DetachInst(Detached, Continue, Unwind, SyncRegion, + InsertBefore); + } + static DetachInst *Create(BasicBlock *Detached, BasicBlock *Continue, + BasicBlock *Unwind, Value *SyncRegion, + BasicBlock *InsertAtEnd) { + return new(4) DetachInst(Detached, Continue, Unwind, SyncRegion, + InsertAtEnd); + } + + /// Provide fast operand accessors + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + Value *getSyncRegion() const { + return Op<-1>(); + } + + void setSyncRegion(Value *SyncRegion) { + Op<-1>() = SyncRegion; + } + + unsigned getNumSuccessors() const { return 2 + hasUnwindDest(); } + + BasicBlock *getSuccessor(unsigned i) const { + assert(i < getNumSuccessors() && "Successor # out of range for detach!"); + return cast((&Op<-2>() - i)->get()); + } + + void setSuccessor(unsigned idx, BasicBlock *NewSucc) { + assert(idx < getNumSuccessors() && "Successor # out of range for detach!"); + *(&Op<-2>() - idx) = reinterpret_cast(NewSucc); + } + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Instruction *I) { + return (I->getOpcode() == Instruction::Detach); + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + BasicBlock *getDetached() const { return getSuccessor(0); } + BasicBlock *getContinue() const { return getSuccessor(1); } + bool hasUnwindDest() const { return getSubclassData(); } + BasicBlock *getUnwindDest() const { + if (hasUnwindDest()) + return getSuccessor(2); + return nullptr; + } + void setUnwindDest(BasicBlock *Unwind) { + assert(hasUnwindDest() && Unwind && + "Invalid unwind destination for detach."); + setSuccessor(2, Unwind); + } + + /// Get the landingpad instruction from the landing pad + /// block (the unwind destination). + LandingPadInst *getLandingPadInst() const; + +private: + void init(Value *SyncRegion, BasicBlock *Detached, BasicBlock *Continue, + BasicBlock *Unwind = nullptr); +}; + +template <> +struct OperandTraits : public VariadicOperandTraits { +}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachInst, Value) + +//===----------------------------------------------------------------------===// +// ReattachInst Class +//===----------------------------------------------------------------------===// + +//===--------------------------------------------------------------------------- +/// ReattachInst - Reattach instruction. This instruction terminates +/// a subCFG and has no successors. The DetachContinue field +/// maintains the continue block after the detach instruction +/// corresponding to this reattach. 
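A hedged usage sketch of the DetachInst accessors above (variable names are illustrative); the unwind destination is only present when `hasUnwindDest()` is set.

```cpp
#include <cassert>

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

void inspectDetach(const DetachInst &DI) {
  BasicBlock *Spawned = DI.getDetached();  // successor 0: the spawned task's entry
  BasicBlock *Cont = DI.getContinue();     // successor 1: the continuation
  BasicBlock *Unwind = DI.getUnwindDest(); // successor 2, or nullptr if absent

  // getNumSuccessors() reflects whether the optional unwind edge exists.
  assert(DI.getNumSuccessors() == (Unwind ? 3u : 2u));
  (void)Spawned;
  (void)Cont;
}
```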
+/// +class ReattachInst : public Instruction { + ReattachInst(const ReattachInst &RI); + void AssertOK(); + // ReattachInst constructors (where C is a block and SR is a token): + // ReattachInst(BB *C, Value *SR) - 'reattach SR, C' + // ReattachInst(BB *C, Value *SR, Inst *I) - 'reattach SR, C', insert before I + // ReattachInst(BB *C, Value *SR, BB *I) - 'reattach SR, C', insert at end + explicit ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + Instruction *InsertBefore = nullptr); + ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + BasicBlock *InsertAtEnd); +protected: + // Note: Instruction needs to be a friend here to call cloneImpl. + friend class Instruction; + ReattachInst *cloneImpl() const; + +public: + static ReattachInst *Create(BasicBlock *DetachContinue, Value *SyncRegion, + Instruction *InsertBefore = nullptr) { + return new(2) ReattachInst(DetachContinue, SyncRegion, InsertBefore); + } + + static ReattachInst *Create(BasicBlock *DetachContinue, Value *SyncRegion, + BasicBlock *InsertAtEnd) { + return new(2) ReattachInst(DetachContinue, SyncRegion, InsertAtEnd); + } + + /// Transparently provide more efficient getOperand methods. + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + Value *getSyncRegion() const { + return Op<-1>(); + } + + void setSyncRegion(Value *SyncRegion) { + Op<-1>() = SyncRegion; + } + + unsigned getNumSuccessors() const { return 1; } + + BasicBlock *getDetachContinue() const { + return getSuccessor(0); + } + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Instruction *I) { + return I->getOpcode() == Instruction::Reattach; + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + BasicBlock *getSuccessor(unsigned i) const { + assert(i < getNumSuccessors() && "Successor # out of range for reattach!"); + return cast((&Op<-2>() - i)->get()); + } + void setSuccessor(unsigned idx, BasicBlock *NewSucc) { + assert(idx < getNumSuccessors() && + "Successor # out of range for reattach!"); + *(&Op<-2>() - idx) = reinterpret_cast(NewSucc); + } +}; + +template <> +struct OperandTraits : public VariadicOperandTraits { +}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ReattachInst, Value) + +//===----------------------------------------------------------------------===// +// SyncInst Class +//===----------------------------------------------------------------------===// + +//===--------------------------------------------------------------------------- +/// SyncInst - Sync instruction. +/// +class SyncInst : public Instruction { + /// Ops list - A sync looks like an unconditional branch to its continuation. + SyncInst(const SyncInst &SI); + void AssertOK(); + // SyncInst constructor (where C is a block and SR is a token): + // SyncInst(BB *C, Value *SR) - 'sync SR, C' + // SyncInst(BB *C, Value *SR, Inst *I) - 'sync SR, C' insert before I + // SyncInst(BB *C, Value *SR, BB *I) - 'sync SR, C' insert at end + explicit SyncInst(BasicBlock *Continue, Value *SyncRegion, + Instruction *InsertBefore = nullptr); + SyncInst(BasicBlock *Continue, Value *SyncRegion, + BasicBlock *InsertAtEnd); +protected: + // Note: Instruction needs to be a friend here to call cloneImpl. 
+ friend class Instruction; + SyncInst *cloneImpl() const; + +public: + static SyncInst *Create(BasicBlock *Continue, Value *SyncRegion, + Instruction *InsertBefore = nullptr) { + return new(2) SyncInst(Continue, SyncRegion, InsertBefore); + } + static SyncInst *Create(BasicBlock *Continue, Value *SyncRegion, + BasicBlock *InsertAtEnd) { + return new(2) SyncInst(Continue, SyncRegion, InsertAtEnd); + } + + /// Transparently provide more efficient getOperand methods. + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + Value *getSyncRegion() const { + return Op<-1>(); + } + + void setSyncRegion(Value *SyncRegion) { + Op<-1>() = SyncRegion; + } + + unsigned getNumSuccessors() const { return 1; } + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Instruction *I) { + return I->getOpcode() == Instruction::Sync; + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + BasicBlock *getSuccessor(unsigned i) const { + assert(i < getNumSuccessors() && "Successor # out of range for sync!"); + return cast((&Op<-2>() - i)->get()); + } + void setSuccessor(unsigned idx, BasicBlock *NewSucc) { + assert(idx < getNumSuccessors() && "Successor # out of range for sync!"); + *(&Op<-2>() - idx) = reinterpret_cast(NewSucc); + } +}; + +template <> +struct OperandTraits : public VariadicOperandTraits { +}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SyncInst, Value) + //===----------------------------------------------------------------------===// // TruncInst Class //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 94c8fa092f45e6e..7926196233dfdf8 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -102,6 +102,7 @@ class IntrinsicInst : public CallInst { bool isAssumeLikeIntrinsic() const { switch (getIntrinsicID()) { default: break; + case Intrinsic::annotation: case Intrinsic::assume: case Intrinsic::sideeffect: case Intrinsic::pseudoprobe: @@ -111,12 +112,31 @@ class IntrinsicInst : public CallInst { case Intrinsic::dbg_label: case Intrinsic::invariant_start: case Intrinsic::invariant_end: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + case Intrinsic::is_constant: case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: case Intrinsic::experimental_noalias_scope_decl: case Intrinsic::objectsize: case Intrinsic::ptr_annotation: case Intrinsic::var_annotation: + case Intrinsic::experimental_gc_result: + case Intrinsic::experimental_gc_relocate: + case Intrinsic::coro_alloc: + case Intrinsic::coro_begin: + case Intrinsic::coro_free: + case Intrinsic::coro_end: + case Intrinsic::coro_frame: + case Intrinsic::coro_size: + case Intrinsic::coro_suspend: + case Intrinsic::coro_subfn_addr: + case Intrinsic::syncregion_start: + case Intrinsic::taskframe_create: + case Intrinsic::taskframe_use: + case Intrinsic::taskframe_end: + case Intrinsic::taskframe_load_guard: + case Intrinsic::sync_unwind: return true; } return false; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index b4e758136b39fb5..ba575bbb20e1e17 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1757,6 +1757,64 @@ def int_coro_subfn_addr : DefaultAttrsIntrinsic< [IntrReadMem, IntrArgMemOnly, ReadOnly>, NoCapture>]>; +///===-------------------------- Tapir Intrinsics 
-------------------------===// +// +def int_syncregion_start + : Intrinsic<[llvm_token_ty], [], [IntrArgMemOnly, IntrWillReturn]>; + +def int_tapir_runtime_start + : Intrinsic<[llvm_token_ty], [], [IntrArgMemOnly, IntrWillReturn]>; + +def int_tapir_runtime_end + : Intrinsic<[], [llvm_token_ty], [IntrArgMemOnly, IntrWillReturn]>; + +// Intrinsics for taskframes. + +// Marker for the start of a taskframe. +def int_taskframe_create + : Intrinsic<[llvm_token_ty], [], [IntrArgMemOnly, IntrWillReturn]>; + +// Marker placed in detached blocks (i.e., task-entry blocks) to +// identify the taskframe used by the spawned task. +def int_taskframe_use + : Intrinsic<[], [llvm_token_ty], [IntrArgMemOnly, IntrWillReturn]>; + +// Marker for the end of a taskframe. +def int_taskframe_end + : Intrinsic<[], [llvm_token_ty], [IntrArgMemOnly, IntrWillReturn]>; + +// Marker for the end of a taskframe along exception-handling unwind +// paths. +def int_taskframe_resume : Intrinsic<[], [llvm_token_ty, llvm_any_ty], + [IntrArgMemOnly, IntrWillReturn, Throws]>; + +// Guard intrinsic to prevent illegal code motion of loads from memory +// locations stored in spawned subtasks. +def int_taskframe_load_guard + : Intrinsic<[], [llvm_anyptr_ty], + [IntrArgMemOnly, IntrWillReturn, NoCapture>]>; + +// Marker for the end of a spawned task along exception-handling +// unwind paths. +def int_detached_rethrow : Intrinsic<[], [llvm_token_ty, llvm_any_ty], + [IntrArgMemOnly, IntrWillReturn, Throws]>; + +// Invokable intrinsic to keep track of a landingpad associated with a +// sync. +def int_sync_unwind + : Intrinsic<[], [llvm_token_ty], [IntrArgMemOnly, IntrWillReturn, Throws]>; + +// Intrinsic to get the grainsize of a Tapir loop. +def int_tapir_loop_grainsize + : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], + [IntrNoMem, IntrWillReturn, IntrSpeculatable]>; + +// Intrinsic to get the frame address of a spawned task. Tapir +// lowering transforms this intrinsic into ordinary frameaddress +// intrinsics. +def int_task_frameaddress + : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrWillReturn]>; + ///===-------------------------- Other Intrinsics --------------------------===// // // TODO: We should introduce a new memory kind fo traps (and other side effects diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 13be9c11f010728..72c5b1a0d589211 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -48,6 +48,10 @@ void initializeGlobalISel(PassRegistry&); /// Initialize all passes linked into the CodeGen library. void initializeTarget(PassRegistry&); +/// Initialize all passes linked into the TapirOpts library. 
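A hedged sketch of pairing the taskframe intrinsics from a lowering or instrumentation pass; it assumes the builder is already positioned where the frame should begin and end.

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Bracket a region with llvm.taskframe.create / llvm.taskframe.end.
Value *openTaskFrame(IRBuilder<> &B, Module *M) {
  return B.CreateCall(
      Intrinsic::getDeclaration(M, Intrinsic::taskframe_create));
}

void closeTaskFrame(IRBuilder<> &B, Module *M, Value *TaskFrame) {
  // taskframe_end takes the token produced by the matching taskframe_create.
  B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::taskframe_end),
               {TaskFrame});
}
```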
+void initializeTapirOpts(PassRegistry&); + +void initializeAAEvalLegacyPassPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeAlwaysInlinerLegacyPassPass(PassRegistry&); void initializeAssignmentTrackingAnalysisPass(PassRegistry &); @@ -76,12 +80,16 @@ void initializeCallGraphViewerPass(PassRegistry&); void initializeCallGraphWrapperPassPass(PassRegistry&); void initializeCheckDebugMachineModulePass(PassRegistry &); void initializeCodeGenPrepareLegacyPassPass(PassRegistry &); +void initializeCodeGenPreparePass(PassRegistry&); void initializeComplexDeinterleavingLegacyPassPass(PassRegistry&); +void initializeComprehensiveStaticInstrumentationLegacyPassPass(PassRegistry&); void initializeConstantHoistingLegacyPassPass(PassRegistry&); void initializeCycleInfoWrapperPassPass(PassRegistry &); void initializeDAEPass(PassRegistry&); void initializeDAHPass(PassRegistry&); void initializeDCELegacyPassPass(PassRegistry&); +void initializeDRFAAWrapperPassPass(PassRegistry&); +void initializeDRFScopedNoAliasWrapperPassPass(PassRegistry&); void initializeDeadMachineInstructionElimPass(PassRegistry&); void initializeDebugifyMachineModulePass(PassRegistry &); void initializeDependenceAnalysisWrapperPassPass(PassRegistry&); @@ -165,7 +173,10 @@ void initializeLoopExtractorLegacyPassPass(PassRegistry &); void initializeLoopInfoWrapperPassPass(PassRegistry&); void initializeLoopPassPass(PassRegistry&); void initializeLoopSimplifyPass(PassRegistry&); +void initializeLoopSpawningPass(PassRegistry&); +void initializeLoopSpawningTIPass(PassRegistry&); void initializeLoopStrengthReducePass(PassRegistry&); +void initializeLoopStripMinePass(PassRegistry&); void initializeLoopUnrollPass(PassRegistry&); void initializeLowerAtomicLegacyPassPass(PassRegistry&); void initializeLowerConstantIntrinsicsPass(PassRegistry&); @@ -174,6 +185,7 @@ void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &); void initializeLowerIntrinsicsPass(PassRegistry&); void initializeLowerInvokeLegacyPassPass(PassRegistry&); void initializeLowerSwitchLegacyPassPass(PassRegistry &); +void initializeLowerTapirToTargetPass(PassRegistry&); void initializeKCFIPass(PassRegistry &); void initializeMIRAddFSDiscriminatorsPass(PassRegistry &); void initializeMIRCanonicalizerPass(PassRegistry &); @@ -274,6 +286,7 @@ void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &); void initializeScavengerTestPass(PassRegistry&); void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&); void initializeSeparateConstOffsetFromGEPLegacyPassPass(PassRegistry &); +void initializeSerializeSmallTasksPass(PassRegistry&); void initializeShadowStackGCLoweringPass(PassRegistry&); void initializeShrinkWrapPass(PassRegistry&); void initializeSingleLoopExtractorPass(PassRegistry&); @@ -297,6 +310,11 @@ void initializeTailDuplicatePass(PassRegistry&); void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&); void initializeTargetPassConfigPass(PassRegistry&); void initializeTargetTransformInfoWrapperPassPass(PassRegistry&); +void initializeTapirCleanupPass(PassRegistry&); +void initializeTapirRaceDetectWrapperPassPass(PassRegistry&); +void initializeTaskInfoWrapperPassPass(PassRegistry&); +void initializeTaskCanonicalizePass(PassRegistry&); +void initializeTaskSimplifyPass(PassRegistry&); void initializeTLSVariableHoistLegacyPassPass(PassRegistry &); void initializeTwoAddressInstructionLegacyPassPass(PassRegistry &); void initializeTypeBasedAAWrapperPassPass(PassRegistry&); diff --git 
a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h index 482b6e55a19d353..7317b9c114aa4f0 100644 --- a/llvm/include/llvm/LTO/Config.h +++ b/llvm/include/llvm/LTO/Config.h @@ -23,6 +23,7 @@ #include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" #include #include @@ -94,6 +95,12 @@ struct Config { /// 'ELF' scheme. VisScheme VisibilityScheme = FromPrevailing; + /// Target for lowering Tapir constructs + TapirTargetID TapirTarget = TapirTargetID::None; + + // Path to OpenCilk runtime bitcode file. + std::string OpenCilkABIBitcodeFile; + /// If this field is set, the set of passes run in the middle-end optimizer /// will be the one specified by the string. Only works with the new pass /// manager as the old one doesn't have this ability. diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index 311d38e8a751f7a..73572d35115f547 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -19,6 +19,7 @@ #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallPrinter.h" +#include "llvm/Analysis/DataRaceFreeAliasAnalysis.h" #include "llvm/Analysis/DomPrinter.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Passes.h" @@ -42,6 +43,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/Scalarizer.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Tapir.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include @@ -64,11 +66,13 @@ namespace { (void) llvm::createSCEVAAWrapperPass(); (void) llvm::createTypeBasedAAWrapperPass(); (void) llvm::createScopedNoAliasAAWrapperPass(); + (void) llvm::createDRFAAWrapperPass(); (void) llvm::createBreakCriticalEdgesPass(); (void) llvm::createCallGraphDOTPrinterPass(); (void) llvm::createCallGraphViewerPass(); (void) llvm::createCFGSimplificationPass(); (void) llvm::createStructurizeCFGPass(); + (void) llvm::createDRFScopedNoAliasWrapperPass(); (void) llvm::createDeadArgEliminationPass(); (void) llvm::createDeadCodeEliminationPass(); (void) llvm::createDependenceAnalysisWrapperPass(); @@ -87,12 +91,16 @@ namespace { (void) llvm::createLazyValueInfoPass(); (void) llvm::createLoopExtractorPass(); (void) llvm::createLoopSimplifyPass(); + (void) llvm::createLoopSimplifyCFGPass(); + (void) llvm::createLoopSpawningTIPass(); (void) llvm::createLoopStrengthReducePass(); + (void) llvm::createLoopStripMinePass(); (void) llvm::createLoopUnrollPass(); (void) llvm::createLowerConstantIntrinsicsPass(); (void) llvm::createLowerGlobalDtorsLegacyPass(); (void) llvm::createLowerInvokePass(); (void) llvm::createLowerSwitchPass(); + (void) llvm::createLowerTapirToTargetPass(); (void) llvm::createNaryReassociatePass(); (void) llvm::createObjCARCContractPass(); (void) llvm::createPromoteMemoryToRegisterPass(); @@ -138,6 +146,8 @@ namespace { (void) llvm::createUnifyLoopExitsPass(); (void) llvm::createFixIrreduciblePass(); (void)llvm::createSelectOptimizePass(); + (void) llvm::createTaskCanonicalizePass(); + (void) llvm::createTaskSimplifyPass(); (void)new llvm::ScalarEvolutionWrapperPass(); llvm::Function::Create(nullptr, llvm::GlobalValue::ExternalLinkage)->viewCFGOnly(); diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 474a19531ff5d3f..ec681e48369b8a5 100644 --- 
a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -58,6 +58,10 @@ class PipelineTuningOptions { /// level. bool SLPVectorization; + /// Tuning option to enable/disable loop stripmining. Its default value + /// is that of the flag: `-stripmine-loops`. + bool LoopStripmine; + /// Tuning option to enable/disable loop unrolling. Its default value is true. bool LoopUnrolling; @@ -240,6 +244,24 @@ class PassBuilder { buildModuleOptimizationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase LTOPhase); + /// Construct the pipeline for lowering Tapir loops to a target parallel + /// runtime. + /// + /// This pipeline is intended to be used early within + /// buildTapirLoweringPipeline at Level > O0 or run on its own for debugging + /// purposes. + ModulePassManager buildTapirLoopLoweringPipeline(OptimizationLevel Level, + ThinOrFullLTOPhase Phase); + + /// Construct the pipeline for lowering Tapir constructs to a target parallel + /// runtime. + /// + /// This pipeline is intended to be used with the PerModuleDefault pipeline + /// and various LTO pipelines to lower Tapir constructs. This pipeline is + /// expected to run late in the parent pipelines. + ModulePassManager buildTapirLoweringPipeline(OptimizationLevel Level, + ThinOrFullLTOPhase Phase); + /// Build a per-module default optimization pipeline. /// /// This provides a good default optimization pipeline for per-module @@ -247,7 +269,8 @@ class PassBuilder { /// typically correspond to frontend "-O[123]" options for optimization /// levels \c O1, \c O2 and \c O3 resp. ModulePassManager buildPerModuleDefaultPipeline(OptimizationLevel Level, - bool LTOPreLink = false); + bool LTOPreLink = false, + bool LowerTapir = false); /// Build a fat object default optimization pipeline. /// @@ -274,7 +297,8 @@ class PassBuilder { /// buildThinLTOPreLinkDefaultPipeline, and the two coordinate closely. ModulePassManager buildThinLTODefaultPipeline(OptimizationLevel Level, - const ModuleSummaryIndex *ImportSummary); + const ModuleSummaryIndex *ImportSummary, + bool LowerTapir = false); /// Build a pre-link, LTO-targeting default optimization pipeline to a pass /// manager. @@ -292,13 +316,15 @@ class PassBuilder { /// when IR coming into the LTO phase was first run through \c /// buildLTOPreLinkDefaultPipeline, and the two coordinate closely. ModulePassManager buildLTODefaultPipeline(OptimizationLevel Level, - ModuleSummaryIndex *ExportSummary); + ModuleSummaryIndex *ExportSummary, + bool LowerTapir = false); /// Build an O0 pipeline with the minimal semantically required passes. /// /// This should only be used for non-LTO and LTO pre-link pipelines. ModulePassManager buildO0DefaultPipeline(OptimizationLevel Level, - bool LTOPreLink = false); + bool LTOPreLink = false, + bool LowerTapir = false); /// Build the default `AAManager` with the default alias analysis pipeline /// registered. @@ -397,6 +423,11 @@ class PassBuilder { /// Print pass names. void printPassNames(raw_ostream &OS); + /// Add optimizations to run immediately after an + /// instrumentation pass, such as CilkSanitizer or CSI. + ModulePassManager + buildPostCilkInstrumentationPipeline(OptimizationLevel Level); + /// Register a callback for a default optimizer pipeline extension /// point /// @@ -519,6 +550,26 @@ class PassBuilder { FullLinkTimeOptimizationLastEPCallbacks.push_back(C); } + /// Register a callback for a default optimizer pipeline extension point. 
+ /// + /// This extension point allows adding passes after optimizations have been + /// performed on the Tapir IR, but before Tapir constructs are lowered to a + /// target runtime. + void registerTapirLateEPCallback( + const std::function &C) { + TapirLateEPCallbacks.push_back(C); + } + + /// Register a callback for a default optimizer pipeline extension point. + /// + /// This extension point allows adding passes after optimizations have been + /// performed on the Tapir IR, but before Tapir constructs are lowered to a + /// target runtime. + void registerTapirLoopEndEPCallback( + const std::function &C) { + TapirLoopEndEPCallbacks.push_back(C); + } + /// Register a callback for parsing an AliasAnalysis Name to populate /// the given AAManager \p AA void registerParseAACallback( @@ -639,6 +690,10 @@ class PassBuilder { OptimizationLevel Level); void invokePipelineEarlySimplificationEPCallbacks(ModulePassManager &MPM, OptimizationLevel Level); + void invokeTapirLateEPCallbacks(ModulePassManager &MPM, + OptimizationLevel Level); + void invokeTapirLoopEndEPCallbacks(ModulePassManager &MPM, + OptimizationLevel Level); static bool checkParametrizedPassName(StringRef Name, StringRef PassName) { if (!Name.consume_front(PassName)) @@ -761,6 +816,10 @@ class PassBuilder { FullLinkTimeOptimizationEarlyEPCallbacks; SmallVector, 2> FullLinkTimeOptimizationLastEPCallbacks; + SmallVector, 2> + TapirLateEPCallbacks; + SmallVector, 2> + TapirLoopEndEPCallbacks; SmallVector, 2> PipelineStartEPCallbacks; SmallVector, 2> diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h index d560ca648132c94..77c87f48e00f2d0 100644 --- a/llvm/include/llvm/Support/GenericLoopInfo.h +++ b/llvm/include/llvm/Support/GenericLoopInfo.h @@ -262,11 +262,12 @@ template class LoopBase { /// Return all blocks inside the loop that have successors outside of the /// loop. These are the blocks _inside of the current loop_ which branch out. /// The returned list is always unique. - void getExitingBlocks(SmallVectorImpl &ExitingBlocks) const; + void getExitingBlocks(SmallVectorImpl &ExitingBlocks, + bool IgnoreDetachUnwind = false) const; /// If getExitingBlocks would return exactly one block, return that block. /// Otherwise return null. - BlockT *getExitingBlock() const; + BlockT *getExitingBlock(bool IgnoreDetachUnwind = false) const; /// Return all of the successor blocks of this loop. These are the blocks /// _outside of the current loop_ which are branched to. diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h index d19022729ace32d..0b016584a285e56 100644 --- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h +++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h @@ -31,7 +31,7 @@ namespace llvm { /// template void LoopBase::getExitingBlocks( - SmallVectorImpl &ExitingBlocks) const { + SmallVectorImpl &ExitingBlocks, bool IgnoreDetachUnwind) const { assert(!isInvalid() && "Loop not in a valid state!"); for (const auto BB : blocks()) for (auto *Succ : children(BB)) @@ -45,7 +45,8 @@ void LoopBase::getExitingBlocks( /// getExitingBlock - If getExitingBlocks would return exactly one block, /// return that block. Otherwise return null. 
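A hedged sketch of registering a pass at the new Tapir-late extension point; `MyTapirDiagnosticsPass` is hypothetical, and the callback signature is assumed to mirror the other module-level extension points.

```cpp
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"

using namespace llvm;

// Hypothetical module pass, shown only to illustrate the registration.
struct MyTapirDiagnosticsPass : PassInfoMixin<MyTapirDiagnosticsPass> {
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &) {
    return PreservedAnalyses::all();
  }
};

void registerTapirCallbacks(PassBuilder &PB) {
  // Runs after Tapir-level optimization but before Tapir constructs are
  // lowered to the target parallel runtime.
  PB.registerTapirLateEPCallback(
      [](ModulePassManager &MPM, OptimizationLevel Level) {
        MPM.addPass(MyTapirDiagnosticsPass());
      });
}
```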
template -BlockT *LoopBase::getExitingBlock() const { +BlockT * +LoopBase::getExitingBlock(bool IgnoreDetachUnwind) const { assert(!isInvalid() && "Loop not in a valid state!"); auto notInLoop = [&](BlockT *BB) { return !contains(BB); }; auto isExitBlock = [&](BlockT *BB, bool AllowRepeats) -> BlockT * { diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h index 969c2cd12f3f089..b2a69448d8c7d1d 100644 --- a/llvm/include/llvm/Transforms/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Instrumentation.h @@ -161,6 +161,29 @@ struct SanitizerCoverageOptions { SanitizerCoverageOptions() = default; }; +// Options for comprehensive static instrumentation +struct CSIOptions { + bool InstrumentFuncEntryExit = true; + bool InstrumentLoops = true; + bool InstrumentBasicBlocks = true; + bool InstrumentMemoryAccesses = true; + bool InstrumentCalls = true; + bool InstrumentAtomics = true; + bool InstrumentMemIntrinsics = true; + bool InstrumentTapir = true; + bool InstrumentAllocas = true; + bool InstrumentAllocFns = true; + bool Interpose = true; + + // TODO: With recent changes LLVM's JIT technology, the jitMode flag no longer + // seems to be necessary. + bool jitMode = false; + bool CallsMayThrow = true; + bool CallsTerminateBlocks = true; + + CSIOptions() = default; +}; + /// Calculate what to divide by to scale counts. /// /// Given the maximum count, calculate a divisor that will scale all the diff --git a/llvm/include/llvm/Transforms/Instrumentation/CSI.h b/llvm/include/llvm/Transforms/Instrumentation/CSI.h new file mode 100644 index 000000000000000..c6deb5561ff2b0f --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/CSI.h @@ -0,0 +1,1619 @@ +//===- CSI.h - CSI implementation structures and hooks -------*- C++ -*----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is part of CSI, a framework that provides comprehensive static +// instrumentation. 
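A hedged example of tailoring CSIOptions so that only memory accesses and Tapir constructs are instrumented; which fields a given tool flips is, of course, tool-specific.

```cpp
#include "llvm/Transforms/Instrumentation.h"

using namespace llvm;

CSIOptions makeMemoryAndTapirOnlyOptions() {
  CSIOptions Opts;                      // every kind of instrumentation is on by default
  Opts.InstrumentFuncEntryExit = false;
  Opts.InstrumentLoops = false;
  Opts.InstrumentBasicBlocks = false;
  Opts.InstrumentCalls = false;
  // Memory accesses, atomics, memory intrinsics, and Tapir constructs
  // (detach/reattach/sync) remain instrumented.
  return Opts;
}
```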
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_CSI_H +#define LLVM_TRANSFORMS_CSI_H + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Instrumentation/SurgicalInstrumentationConfig.h" + +namespace llvm { + +class Loop; +class LoopInfo; +class Spindle; +class Task; +class TaskInfo; +class ScalarEvolution; + +static const char *const CsiRtUnitInitName = "__csirt_unit_init"; +static const char *const CsiRtUnitCtorName = "csirt.unit_ctor"; +static const char *const CsiFunctionBaseIdName = "__csi_unit_func_base_id"; +static const char *const CsiFunctionExitBaseIdName = + "__csi_unit_func_exit_base_id"; +static const char *const CsiBasicBlockBaseIdName = "__csi_unit_bb_base_id"; +static const char *const CsiLoopBaseIdName = "__csi_unit_loop_base_id"; +static const char *const CsiLoopExitBaseIdName = "__csi_unit_loop_exit_base_id"; +static const char *const CsiCallsiteBaseIdName = "__csi_unit_callsite_base_id"; +static const char *const CsiLoadBaseIdName = "__csi_unit_load_base_id"; +static const char *const CsiStoreBaseIdName = "__csi_unit_store_base_id"; +static const char *const CsiAllocaBaseIdName = "__csi_unit_alloca_base_id"; +static const char *const CsiDetachBaseIdName = "__csi_unit_detach_base_id"; +static const char *const CsiTaskBaseIdName = "__csi_unit_task_base_id"; +static const char *const CsiTaskExitBaseIdName = "__csi_unit_task_exit_base_id"; +static const char *const CsiDetachContinueBaseIdName = + "__csi_unit_detach_continue_base_id"; +static const char *const CsiSyncBaseIdName = "__csi_unit_sync_base_id"; +static const char *const CsiAllocFnBaseIdName = "__csi_unit_allocfn_base_id"; +static const char *const CsiFreeBaseIdName = "__csi_unit_free_base_id"; + +static const char *const CsiDefaultDebugNamePrefix = "__csi_unit_function_name_"; + +static const char *const CsiUnitSizeTableName = "__csi_unit_size_table"; +static const char *const CsiUnitFedTableName = "__csi_unit_fed_table"; +static const char *const CsiFuncIdVariablePrefix = "__csi_func_id_"; +static const char *const CsiUnitFedTableArrayName = "__csi_unit_fed_tables"; +static const char *const CsiUnitSizeTableArrayName = "__csi_unit_size_tables"; +static const char *const CsiInitCallsiteToFunctionName = + "__csi_init_callsite_to_function"; +static const char *const CsiDisableInstrumentationName = + "__csi_disable_instrumentation"; + +using csi_id_t = int64_t; +static const csi_id_t CsiUnknownId = -1; +static const csi_id_t CsiCallsiteUnknownTargetId = CsiUnknownId; +// See clang/lib/CodeGen/CodeGenModule.h: +static const int CsiUnitCtorPriority = 0; + +/// Maintains a mapping from CSI ID to static data for that ID. +class ForensicTable { +public: + ForensicTable() {} + ForensicTable(Module &M, StringRef BaseIdName, StringRef TableName = "", + bool UseExistingBaseId = true); + + /// The number of entries in this forensic table + uint64_t size() const { return IdCounter; } + + /// Get the local ID of the given Value. + uint64_t getId(const Value *V); + + /// The GlobalVariable holding the base ID for this forensic table. + GlobalVariable *baseId() const { return BaseId; } + + /// Converts a local to global ID conversion. 
+ /// + /// This is done by using the given IRBuilder to insert a load to the base ID + /// global variable followed by an add of the base value and the local ID. + /// + /// \returns A Value holding the global ID corresponding to the + /// given local ID. + Value *localToGlobalId(uint64_t LocalId, IRBuilder<> &IRB) const; + + /// Helper function to get or create a string for a forensic-table entry. + static Constant *getObjectStrGV(Module &M, StringRef Str, const Twine GVName); + +protected: + /// The GlobalVariable holding the base ID for this FED table. + GlobalVariable *BaseId = nullptr; + /// Counter of local IDs used so far. + uint64_t IdCounter = 0; + /// Map of Value to Local ID. + DenseMap ValueToLocalIdMap; + StringRef TableName; +}; + +/// Maintains a mapping from CSI ID to front-end data for that ID. +/// +/// The front-end data currently is the source location that a given +/// CSI ID corresponds to. +class FrontEndDataTable : public ForensicTable { +public: + FrontEndDataTable() : ForensicTable() {} + FrontEndDataTable(Module &M, StringRef BaseIdName, + StringRef TableName = CsiUnitFedTableName, + StringRef DebugNamePrefix = CsiDefaultDebugNamePrefix, + bool UseExistingBaseId = true) + : ForensicTable(M, BaseIdName, TableName, UseExistingBaseId), + DebugNamePrefix(DebugNamePrefix) {} + + /// The number of entries in this FED table + uint64_t size() const { return LocalIdToSourceLocationMap.size(); } + + /// Add the given Function to this FED table. + /// \returns The local ID of the Function. + uint64_t add(const Function &F); + + /// Add the given BasicBlock to this FED table. + /// \returns The local ID of the BasicBlock. + uint64_t add(const BasicBlock &BB); + + /// Add the given Instruction to this FED table. + /// \returns The local ID of the Instruction. + uint64_t add(const Instruction &I, const StringRef &RealName = ""); + + /// Get the Type for a pointer to a FED table entry. + /// + /// A FED table entry is just a source location. + static PointerType *getPointerType(LLVMContext &C); + + /// Insert this FED table into the given Module. + /// + /// The FED table is constructed as a ConstantArray indexed by local + /// IDs. The runtime is responsible for performing the mapping that + /// allows the table to be indexed by global ID. + Constant *insertIntoModule(Module &M) const; + +private: + struct SourceLocation { + StringRef Name; + int32_t Line; + int32_t Column; + StringRef Filename; + StringRef Directory; + }; + StringRef DebugNamePrefix; + + /// Map of local ID to SourceLocation. + DenseMap LocalIdToSourceLocationMap; + + /// Create a struct type to match the "struct SourceLocation" type. + /// (and the source_loc_t type in csi.h). + static StructType *getSourceLocStructType(LLVMContext &C); + + /// Append the debug information to the table, assigning it the next + /// available ID. + /// + /// \returns The local ID of the appended information. + /// @{ + void add(uint64_t ID, const DILocation *Loc, const StringRef &RealName = ""); + void add(uint64_t ID, const DISubprogram *Subprog); + /// @} + + /// Append the line and file information to the table, assigning it + /// the next available ID. + /// + /// \returns The new local ID of the DILocation. + void add(uint64_t ID, int32_t Line = -1, int32_t Column = -1, + StringRef Filename = "", StringRef Directory = "", + StringRef Name = ""); +}; + +/// Maintains a mapping from CSI ID of a basic block to the size of that basic +/// block in LLVM IR instructions. 
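A hedged sketch of how an instrumentation pass might use the FED table: register the instruction, then materialize its global ID at the instrumentation site to pass to a `__csi_*` hook.

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Instrumentation/CSI.h"

using namespace llvm;

Value *emitGlobalCsiId(FrontEndDataTable &LoadFED, Instruction &I) {
  // Assign (or look up) the local ID for this instruction and record its
  // source location in the FED table.
  uint64_t LocalId = LoadFED.add(I);

  // At run time, global ID = unit base ID + local ID; localToGlobalId emits
  // the load of the base-ID global and the add at the current position.
  IRBuilder<> IRB(&I);
  return LoadFED.localToGlobalId(LocalId, IRB);
}
```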
+class SizeTable : public ForensicTable { +public: + SizeTable() : ForensicTable() {} + SizeTable(Module &M, StringRef BaseIdName) : ForensicTable(M, BaseIdName) {} + + /// The number of entries in this table + uint64_t size() const { return LocalIdToSizeMap.size(); } + + /// Add the given basic block to this table. + /// \returns The local ID of the basic block. + uint64_t add(const BasicBlock &BB, TargetTransformInfo *TTI); + + /// Get the Type for a pointer to a table entry. + /// + /// A table entry is just a source location. + static PointerType *getPointerType(LLVMContext &C); + + /// Insert this table into the given Module. + /// + /// The table is constructed as a ConstantArray indexed by local IDs. The + /// runtime is responsible for performing the mapping that allows the table to + /// be indexed by global ID. + Constant *insertIntoModule(Module &M) const; + +private: + struct SizeInformation { + // This count includes every IR instruction. + int32_t FullIRSize; + // This count excludes IR instructions that don't lower to any real + // instructions, e.g., PHI instructions, debug intrinsics, and lifetime + // intrinsics. + int32_t NonEmptyIRSize; + }; + + /// Map of local ID to size. + DenseMap LocalIdToSizeMap; + + /// Create a struct type to match the "struct SourceLocation" type. + /// (and the source_loc_t type in csi.h). + static StructType *getSizeStructType(LLVMContext &C); + + /// Append the size information to the table. + void add(uint64_t ID, int32_t FullIRSize = 0, int32_t NonEmptyIRSize = 0); +}; + +/// Represents a property value passed to hooks. +class CsiProperty { +public: + CsiProperty() {} + + virtual ~CsiProperty() {} + + /// Return the coerced type of a property. + /// + /// TODO: Right now, this function simply returns a 64-bit integer. Although + /// this solution works for x86_64, it should be generalized to handle other + /// architectures in the future. + static Type *getCoercedType(LLVMContext &C, StructType *Ty) { + // Must match the definition of property type in csi.h + // return StructType::get(IntegerType::get(C, 64), + // nullptr); + // We return an integer type, rather than a struct type, to deal with x86_64 + // type coercion on struct bit fields. + return IntegerType::get(C, 64); + } + + /// Return a constant value holding this property. + virtual Constant *getValueImpl(LLVMContext &C) const = 0; + + Constant *getValue(LLVMContext &C) const { + return getValueImpl(C); + } + + Constant *getValue(IRBuilder<> &IRB) const { + return getValueImpl(IRB.getContext()); + } +}; + +class CsiFuncProperty : public CsiProperty { +public: + CsiFuncProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.NumSyncReg), + IntegerType::get(C, PropBits.MaySpawn), + IntegerType::get(C, PropBits.Padding))); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. 
+ Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the number of sync regions in this function. + void setNumSyncReg(unsigned v) { PropValue.Fields.NumSyncReg = v; } + + /// Set the value of the MaySpawn property. + void setMaySpawn(bool v) { PropValue.Fields.MaySpawn = v; } + + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned NumSyncReg : 8; + unsigned MaySpawn : 1; + uint64_t Padding : 55; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int NumSyncReg; + int MaySpawn; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {8, 1, (64 - 8 - 1)}; +}; + +class CsiFuncExitProperty : public CsiProperty { +public: + CsiFuncExitProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.MaySpawn), + IntegerType::get(C, PropBits.EHReturn), + IntegerType::get(C, PropBits.Padding))); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the MaySpawn property. + void setMaySpawn(bool v) { PropValue.Fields.MaySpawn = v; } + + /// Set the value of the EHReturn property. + void setEHReturn(bool v) { PropValue.Fields.EHReturn = v; } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned MaySpawn : 1; + unsigned EHReturn : 1; + uint64_t Padding : 62; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int MaySpawn; + int EHReturn; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {1, 1, (64 - 1 - 1)}; +}; + +class CsiLoopProperty : public CsiProperty { +public: + CsiLoopProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. + static StructType *getStructType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return StructType::get(IntegerType::get(C, PropBits.IsTapirLoop), + IntegerType::get(C, PropBits.HasUniqueExitingBlock), + IntegerType::get(C, PropBits.Padding)); + } + static Type *getType(LLVMContext &C) { + return getCoercedType(C, getStructType(C)); + } + /// Get the default value for this property. 
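Each Csi*Property packs its fields into a union of a bit-field struct and a `uint64_t`, and `getCoercedType` deliberately hands hooks a single i64 rather than a struct. The standalone sketch below mirrors the CsiFuncProperty layout shown above; bit-field layout is implementation-defined, which is why the header flags the plain-i64 coercion as an x86-64-specific choice.

```cpp
// Standalone sketch of the bit-field packing used by the Csi*Property classes,
// mirroring CsiFuncProperty (NumSyncReg : 8, MaySpawn : 1, Padding : 55).
// The concrete bit positions assume the x86-64 layout the header relies on.
#include <cassert>
#include <cstdint>

union FuncProperty {
  struct {
    unsigned NumSyncReg : 8;
    unsigned MaySpawn : 1;
    uint64_t Padding : 55;
  } Fields;
  uint64_t Bits;
};
static_assert(sizeof(FuncProperty) == 8, "property must fit in one i64");

int main() {
  FuncProperty P;
  P.Bits = 0;
  P.Fields.NumSyncReg = 2; // two sync regions in the function
  P.Fields.MaySpawn = 1;   // the function may spawn
  // On the assumed layout, NumSyncReg occupies bits 0-7 and MaySpawn bit 8,
  // so a hook receives the single integer 0x102.
  assert(P.Bits == 0x102);
  return 0;
}
```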
+ static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsTapirLoop property. + void setIsTapirLoop(bool v) { PropValue.Fields.IsTapirLoop = v; } + + /// Set the value of the HasUniqueExitingBlock property. + void setHasUniqueExitingBlock(bool v) { + PropValue.Fields.HasUniqueExitingBlock = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsTapirLoop : 1; + unsigned HasUniqueExitingBlock : 1; + uint64_t Padding : 62; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsTapirLoop; + int HasUniqueExitingBlock; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {1, 1, (64 - 1 - 1)}; +}; + +class CsiLoopExitProperty : public CsiProperty { +public: + CsiLoopExitProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. + static StructType *getStructType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return StructType::get(IntegerType::get(C, PropBits.IsLatch), + IntegerType::get(C, PropBits.Padding)); + } + static Type *getType(LLVMContext &C) { + return getCoercedType(C, getStructType(C)); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsLandingPad property. + void setIsLatch(bool v) { PropValue.Fields.IsLatch = v; } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsLatch : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsLatch; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {1, (64 - 1)}; +}; + +class CsiBBProperty : public CsiProperty { +public: + CsiBBProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.IsLandingPad), + IntegerType::get(C, PropBits.IsEHPad), + IntegerType::get(C, PropBits.Padding))); + } + /// Get the default value for this property. 
+ static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsLandingPad property. + void setIsLandingPad(bool v) { PropValue.Fields.IsLandingPad = v; } + + /// Set the value of the IsEHPad property. + void setIsEHPad(bool v) { PropValue.Fields.IsEHPad = v; } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsLandingPad : 1; + unsigned IsEHPad : 1; + uint64_t Padding : 62; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsLandingPad; + int IsEHPad; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {1, 1, (64 - 1 - 1)}; +}; + +class CsiDetachProperty : public CsiProperty { +public: + CsiDetachProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. + static StructType *getStructType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return StructType::get(IntegerType::get(C, PropBits.ForTapirLoopBody), + IntegerType::get(C, PropBits.Padding)); + } + static Type *getType(LLVMContext &C) { + return getCoercedType(C, getStructType(C)); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsTapirLoopBody property. + void setForTapirLoopBody(bool v) { PropValue.Fields.ForTapirLoopBody = v; } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned ForTapirLoopBody : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int ForTapirLoopBody; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {1, (64 - 1)}; +}; + +class CsiTaskProperty : public CsiProperty { +public: + CsiTaskProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. + static StructType *getStructType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return StructType::get(IntegerType::get(C, PropBits.IsTapirLoopBody), + IntegerType::get(C, PropBits.Padding)); + } + static Type *getType(LLVMContext &C) { + return getCoercedType(C, getStructType(C)); + } + /// Get the default value for this property. 
+ static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsTapirLoop property. + void setIsTapirLoopBody(bool v) { PropValue.Fields.IsTapirLoopBody = v; } + + /// Set the number of sync regions in this function. + void setNumSyncReg(unsigned v) { PropValue.Fields.NumSyncReg = v; } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsTapirLoopBody : 1; + unsigned NumSyncReg : 8; + uint64_t Padding : 55; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsTapirLoopBody; + int NumSyncReg; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {1, 8, (64 - 1 - 8)}; +}; + +class CsiTaskExitProperty : public CsiProperty { +public: + CsiTaskExitProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. + static StructType *getStructType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return StructType::get(IntegerType::get(C, PropBits.IsTapirLoopBody), + IntegerType::get(C, PropBits.Padding)); + } + static Type *getType(LLVMContext &C) { + return getCoercedType(C, getStructType(C)); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsTapirLoopBody property. + void setIsTapirLoopBody(bool v) { PropValue.Fields.IsTapirLoopBody = v; } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsTapirLoopBody : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsTapirLoopBody; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {1, (64 - 1)}; +}; + +class CsiDetachContinueProperty : public CsiProperty { +public: + CsiDetachContinueProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. 
+ static StructType *getStructType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return StructType::get(IntegerType::get(C, PropBits.IsUnwind), + IntegerType::get(C, PropBits.ForTapirLoopBody), + IntegerType::get(C, PropBits.Padding)); + } + static Type *getType(LLVMContext &C) { + return getCoercedType(C, getStructType(C)); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsUnwind property. + void setIsUnwind(bool v = true) { PropValue.Fields.IsUnwind = v; } + + /// Set the value of the ForTapirLoopBody property. + void setForTapirLoopBody(bool v = true) { + PropValue.Fields.ForTapirLoopBody = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsUnwind : 1; + unsigned ForTapirLoopBody : 1; + uint64_t Padding : 62; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsUnwind; + int ForTapirLoopBody; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {1, 1, (64 - 1 - 1)}; +}; + +class CsiCallProperty : public CsiProperty { +public: + CsiCallProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.IsIndirect), + IntegerType::get(C, PropBits.IsUnwind), + IntegerType::get(C, PropBits.Padding))); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get( + // StructTy, + // ConstantInt::get(IntegerType::get(C, PropBits.IsIndirect), + // PropValue.IsIndirect), + // ConstantInt::get(IntegerType::get(C, PropBits.Padding), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsIndirect property. + void setIsIndirect(bool v) { PropValue.Fields.IsIndirect = v; } + /// Set the value of the IsIndirect property. + void setIsUnwind(bool v = true) { PropValue.Fields.IsUnwind = v; } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsIndirect : 1; + unsigned IsUnwind : 1; + uint64_t Padding : 62; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. 
+ Property PropValue; + + typedef struct { + int IsIndirect; + int IsUnwind; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {1, 1, (64 - 1 - 1)}; +}; + +// This class assumes that fields in both the load and store properties appear +// in the same bit positions. +class CsiLoadStoreProperty : public CsiProperty { +public: + CsiLoadStoreProperty() { PropValue.Bits = 0; } + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h. + return CsiProperty::getCoercedType( + C, + StructType::get(IntegerType::get(C, PropBits.Alignment), + IntegerType::get(C, PropBits.IsVtableAccess), + IntegerType::get(C, PropBits.IsConstant), + IntegerType::get(C, PropBits.IsOnStack), + IntegerType::get(C, PropBits.MayBeCaptured), + IntegerType::get(C, PropBits.IsAtomic), + IntegerType::get(C, PropBits.IsThreadLocal), + IntegerType::get(C, PropBits.LoadReadBeforeWriteInBB), + IntegerType::get(C, PropBits.Padding))); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the Alignment property. + void setAlignment(const MaybeAlign A) { + if (unsigned EncAlign = encode(A)) + PropValue.Fields.Alignment = 1 << (EncAlign - 1); + else + PropValue.Fields.Alignment = 0; + } + void setAlignment(const Align A) { + if (unsigned EncAlign = encode(A)) + PropValue.Fields.Alignment = 1 << (EncAlign - 1); + else + PropValue.Fields.Alignment = 0; + } + /// Set the value of the IsVtableAccess property. + void setIsVtableAccess(bool v) { PropValue.Fields.IsVtableAccess = v; } + /// Set the value of the IsConstant property. + void setIsConstant(bool v) { PropValue.Fields.IsConstant = v; } + /// Set the value of the IsOnStack property. + void setIsOnStack(bool v) { PropValue.Fields.IsOnStack = v; } + /// Set the value of the MayBeCaptured property. + void setMayBeCaptured(bool v) { PropValue.Fields.MayBeCaptured = v; } + /// Set the value of the IsAtomic property. + void setIsAtomic(bool v) { PropValue.Fields.IsAtomic = v; } + /// Set the value of the IsThreadLocal property. + void setIsThreadLocal(bool v) { PropValue.Fields.IsThreadLocal = v; } + /// Set the value of the LoadReadBeforeWriteInBB property. + void setLoadReadBeforeWriteInBB(bool v) { + PropValue.Fields.LoadReadBeforeWriteInBB = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned Alignment : 8; + unsigned IsVtableAccess : 1; + unsigned IsConstant : 1; + unsigned IsOnStack : 1; + unsigned MayBeCaptured : 1; + unsigned IsAtomic : 1; + unsigned IsThreadLocal : 1; + unsigned LoadReadBeforeWriteInBB : 1; + uint64_t Padding : 49; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int Alignment; + int IsVtableAccess; + int IsConstant; + int IsOnStack; + int MayBeCaptured; + int IsAtomic; + int IsThreadLocal; + int LoadReadBeforeWriteInBB; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. 
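`setAlignment` above stores the access alignment in the 8-bit Alignment field: LLVM's `encode()` yields log2(alignment) + 1 (with 0 meaning "no alignment"), so `1 << (EncAlign - 1)` recovers the alignment in bytes. A standalone sketch of the arithmetic follows; note that the 8-bit field can only represent power-of-two alignments up to 128.

```cpp
// Standalone sketch of the alignment encoding used by
// CsiLoadStoreProperty::setAlignment: encode() is log2(alignment) + 1,
// so 1 << (enc - 1) reproduces the alignment in bytes, which is then
// stored in the 8-bit Alignment field.
#include <cassert>
#include <cstdint>

static uint8_t packAlignment(uint64_t AlignBytes /* power of two, or 0 */) {
  unsigned Enc = 0;
  for (uint64_t A = AlignBytes; A > 1; A >>= 1)
    ++Enc;                 // Enc = log2(AlignBytes)
  if (AlignBytes != 0)
    ++Enc;                 // encode() adds 1 so that 0 means "unset"
  return Enc ? uint8_t(1u << (Enc - 1)) : 0;
}

int main() {
  assert(packAlignment(0) == 0);     // no alignment recorded
  assert(packAlignment(1) == 1);
  assert(packAlignment(16) == 16);   // a 16-byte-aligned access stores 16
  assert(packAlignment(128) == 128); // largest value the 8-bit field can hold
  return 0;
}
```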
+ static constexpr PropertyBits PropBits = { + 8, 1, 1, 1, 1, 1, 1, 1, (64 - 8 - 1 - 1 - 1 - 1 - 1 - 1 - 1)}; +}; + +class CsiAllocaProperty : public CsiProperty { +public: + CsiAllocaProperty() { PropValue.Bits = 0; } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.IsStatic), + IntegerType::get(C, PropBits.Padding))); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsIndirect property. + void setIsStatic(bool v) { PropValue.Fields.IsStatic = v; } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsStatic : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsStatic; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {1, (64 - 1)}; +}; + +class CsiAllocFnProperty : public CsiProperty { +public: + CsiAllocFnProperty() { PropValue.Bits = 0; } + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.AllocFnTy), + IntegerType::get(C, PropBits.Padding))); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the allocation function type (e.g., malloc, calloc, new). + void setAllocFnTy(unsigned v) { PropValue.Fields.AllocFnTy = v; } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned AllocFnTy : 8; + uint64_t Padding : 56; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int AllocFnTy; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {8, (64 - 8)}; +}; + +class CsiFreeProperty : public CsiProperty { +public: + CsiFreeProperty() { PropValue.Bits = 0; } + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.FreeTy), + IntegerType::get(C, PropBits.Padding))); + } + /// Get the default value for this property. + static Constant *getDefaultValueImpl(LLVMContext &C) { + return Constant::getNullValue(getType(C)); + } + + /// Return a constant value holding this property. 
+ Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the allocation function type (e.g., malloc, calloc, new). + void setFreeTy(unsigned v) { PropValue.Fields.FreeTy = v; } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned FreeTy : 8; + uint64_t Padding : 56; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int FreeTy; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = {8, (64 - 8)}; +}; + +struct CSISetupImpl { +public: + CSISetupImpl(Module &M, const CSIOptions &Options = CSIOptions()) + : M(M), Options(Options) {} + + bool run(); + +private: + bool setupFunction(Function &F); + + Module &M; + CSIOptions Options; +}; + +struct CSIImpl { +public: + CSIImpl(Module &M, CallGraph *CG, + function_ref GetDomTree, + function_ref GetLoopInfo, + function_ref GetTaskInfo, + function_ref GetTLI, + function_ref GetSE, + function_ref GetTTI, + const CSIOptions &Options = CSIOptions()) + : M(M), DL(M.getDataLayout()), CG(CG), GetDomTree(GetDomTree), + GetLoopInfo(GetLoopInfo), GetTaskInfo(GetTaskInfo), GetTLI(GetTLI), + GetScalarEvolution(GetSE), GetTTI(GetTTI), Options(Options) { + loadConfiguration(); + } + CSIImpl(Module &M, CallGraph *CG, + function_ref GetDomTree, + function_ref GetLoopInfo, + function_ref GetTaskInfo, + function_ref GetTLI, + const CSIOptions &Options = CSIOptions()) + : M(M), DL(M.getDataLayout()), CG(CG), GetDomTree(GetDomTree), + GetLoopInfo(GetLoopInfo), GetTaskInfo(GetTaskInfo), GetTLI(GetTLI), + Options(Options) { + loadConfiguration(); + } + + virtual ~CSIImpl() {} + + bool run(); + + /// Get the number of bytes accessed via the given address. + static int getNumBytesAccessed(Type *OrigTy, const DataLayout &DL); + + /// Members to extract properties of loads/stores. + static bool isVtableAccess(const Instruction *I); + static bool addrPointsToConstantData(const Value *Addr); + static bool isAtomic(const Instruction *I); + static bool isThreadLocalObject(const Value *Obj); + static bool isAllocFn(const Instruction *I, const TargetLibraryInfo *TLI); + static bool isAllocFn(const Value *V, const TargetLibraryInfo *TLI) { + if (const CallBase *CB = dyn_cast(V)) + return isAllocFn(CB, TLI); + return false; + } + static bool getAllocFnArgs(const Instruction *I, + SmallVectorImpl &AllocFnArgs, + Type *SizeTy, Type *AddrTy, + const TargetLibraryInfo &TLI); + static bool isFreeFn(const Instruction *I, const TargetLibraryInfo *TLI); + + /// Helper functions to set up the CFG for CSI instrumentation. + static void setupCalls(Function &F); + static void setupBlocks(Function &F, const TargetLibraryInfo *TLI, + DominatorTree *DT = nullptr, LoopInfo *LI = nullptr); + static void splitBlocksAtCalls(Function &F, DominatorTree *DT = nullptr, + LoopInfo *LI = nullptr); + + /// Helper function that identifies calls or invokes of placeholder functions, + /// such as debug-info intrinsics or lifetime intrinsics. 
+ static bool callsPlaceholderFunction(const Instruction &I); + + static Constant *getDefaultID(IRBuilder<> &IRB) { + return IRB.getInt64(CsiUnknownId); + } + + static bool spawnsTapirLoopBody(DetachInst *DI, LoopInfo &LI, TaskInfo &TI); + + static BasicBlock::iterator + getFirstInsertionPtInDetachedBlock(BasicBlock *Detached); + + // Return true if BB is an entry block to a function or task, false otherwise. + static bool isEntryBlock(const BasicBlock &BB, const TaskInfo &TI); + +protected: + /// Initialize the CSI pass. + void initializeCsi(); + /// Finalize the CSI pass. + void finalizeCsi(); + + /// Initialize FunctionCallees for the CSI hooks. + /// @{ + void initializeLoadStoreHooks(); + void initializeFuncHooks(); + void initializeBasicBlockHooks(); + void initializeLoopHooks(); + void initializeCallsiteHooks(); + void initializeAllocaHooks(); + void initializeMemIntrinsicsHooks(); + void initializeTapirHooks(); + void initializeAllocFnHooks(); + /// @} + + static StructType *getUnitFedTableType(LLVMContext &C, + PointerType *EntryPointerType); + static Constant *fedTableToUnitFedTable(Module &M, + StructType *UnitFedTableType, + FrontEndDataTable &FedTable); + static StructType *getUnitSizeTableType(LLVMContext &C, + PointerType *EntryPointerType); + static Constant *sizeTableToUnitSizeTable(Module &M, + StructType *UnitSizeTableType, + SizeTable &SzTable); + /// Initialize the front-end data table structures. + void initializeFEDTables(); + /// Collect unit front-end data table structures for finalization. + void collectUnitFEDTables(); + /// Initialize the front-end data table structures. + void initializeSizeTables(); + /// Collect unit front-end data table structures for finalization. + void collectUnitSizeTables(); + + virtual CallInst *createRTUnitInitCall(IRBuilder<> &IRB); + + // Get the local ID of the given function. + uint64_t getLocalFunctionID(Function &F); + /// Generate a function that stores global function IDs into a set + /// of externally-visible global variables. + void generateInitCallsiteToFunction(); + + Instruction *getEntryBBInsertPt(BasicBlock &BB); + + /// Compute CSI properties on the given ordered list of loads and stores. + void computeLoadAndStoreProperties( + SmallVectorImpl> + &LoadAndStoreProperties, + SmallVectorImpl &BBLoadsAndStores); + + /// Insert calls to the instrumentation hooks. + /// @{ + void addLoadStoreInstrumentation(Instruction *I, FunctionCallee BeforeFn, + FunctionCallee AfterFn, Value *CsiId, + Type *AddrType, Value *Addr, int NumBytes, + CsiLoadStoreProperty &Prop); + void instrumentLoadOrStore(Instruction *I, CsiLoadStoreProperty &Prop); + void instrumentAtomic(Instruction *I); + bool instrumentMemIntrinsic(Instruction *I); + void instrumentCallsite(Instruction *I, DominatorTree *DT); + void instrumentBasicBlock(BasicBlock &BB, const TaskInfo &TI); + void instrumentLoop(Loop &L, TaskInfo &TI, ScalarEvolution *SE); + + void instrumentDetach(DetachInst *DI, unsigned SyncRegNum, + unsigned NumSyncRegs, DominatorTree *DT, TaskInfo &TI, + LoopInfo &LI); + void instrumentSync(SyncInst *SI, unsigned SyncRegNum); + void instrumentAlloca(Instruction *I, TaskInfo &TI); + void instrumentAllocFn(Instruction *I, DominatorTree *DT, + const TargetLibraryInfo *TLI); + void instrumentFree(Instruction *I, const TargetLibraryInfo *TLI); + + void interposeCall(Instruction *I); + + void instrumentFunction(Function &F); + /// @} + + /// Obtain the signature for the interposition function given the + /// original function that needs interpositioning. 
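CSIImpl's instrument* methods insert calls to `__csi_*` hooks around functions, basic blocks, memory accesses, call sites, and Tapir constructs; a CSI tool is simply a library that defines whichever hooks it cares about. The sketch below is flavor only: the hook names follow the `__csi_*` convention, but the parameter lists are simplified assumptions on my part, and the authoritative prototypes and property types live in csi.h.

```cpp
// Flavor-only sketch of a CSI tool.  The hook names follow the __csi_*
// convention, but these signatures are simplified assumptions -- consult
// csi.h for the real prototypes and property types.
#include <cstdint>
#include <cstdio>

extern "C" {

typedef int64_t csi_id_t;

// Called on entry to each instrumented function; prop is assumed here to be
// the coerced 64-bit CsiFuncProperty (MaySpawn at bit 8 on the x86-64 layout
// sketched earlier).
void __csi_func_entry(csi_id_t func_id, uint64_t prop) {
  std::printf("enter function %lld (may_spawn=%d)\n", (long long)func_id,
              (int)((prop >> 8) & 1));
}

// Called before each instrumented load (parameter list assumed).
void __csi_before_load(csi_id_t load_id, const void *addr, int32_t num_bytes,
                       uint64_t prop) {
  (void)prop;
  std::printf("load %d bytes at %p (id %lld)\n", num_bytes, addr,
              (long long)load_id);
}

} // extern "C"
```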
+ Function *getInterpositionFunction(Function *F); + + /// Insert a call to the given hook function before the given instruction. + CallInst* insertHookCall(Instruction *I, FunctionCallee HookFunction, + ArrayRef HookArgs); + bool updateArgPHIs(BasicBlock *Succ, BasicBlock *BB, + FunctionCallee HookFunction, ArrayRef HookArgs, + ArrayRef DefaultHookArgs); + CallInst *insertHookCallInSuccessorBB(BasicBlock *Succ, BasicBlock *BB, + FunctionCallee HookFunction, + ArrayRef HookArgs, + ArrayRef DefaultHookArgs); + void insertHookCallAtSharedEHSpindleExits(Spindle *SharedEHSpindle, Task *T, + FunctionCallee HookFunction, + FrontEndDataTable &FED, + ArrayRef HookArgs, + ArrayRef DefaultArgs); + + /// Return true if the given function should not be instrumented. + static bool shouldNotInstrumentFunction(Function &F); + + // Update the attributes on the instrumented function that might be + // invalidated by the inserted instrumentation. + void updateInstrumentedFnAttrs(Function &F); + // List of all allocation function types. This list needs to remain + // consistent with TargetLibraryInfo and with csi.h. + enum class AllocFnTy { + malloc = 0, + valloc, + calloc, + aligned_alloc, + realloc, + reallocf, + Znwj, + ZnwjRKSt9nothrow_t, + Znwm, + ZnwmRKSt9nothrow_t, + Znaj, + ZnajRKSt9nothrow_t, + Znam, + ZnamRKSt9nothrow_t, + msvc_new_int, + msvc_new_int_nothrow, + msvc_new_longlong, + msvc_new_longlong_nothrow, + msvc_new_array_int, + msvc_new_array_int_nothrow, + msvc_new_array_longlong, + msvc_new_array_longlong_nothrow, + ZnwjSt11align_val_t, + ZnwmSt11align_val_t, + ZnajSt11align_val_t, + ZnamSt11align_val_t, + ZnwjSt11align_val_tRKSt9nothrow_t, + ZnwmSt11align_val_tRKSt9nothrow_t, + ZnajSt11align_val_tRKSt9nothrow_t, + ZnamSt11align_val_tRKSt9nothrow_t, + posix_memalign, + strdup, + strndup, + LAST_ALLOCFNTY + }; + + static AllocFnTy getAllocFnTy(const LibFunc &F) { + switch (F) { + default: + return AllocFnTy::LAST_ALLOCFNTY; + case LibFunc_malloc: + return AllocFnTy::malloc; + case LibFunc_valloc: + return AllocFnTy::valloc; + case LibFunc_aligned_alloc: + // aligned_alloc(align_val_t, size_t) + return AllocFnTy::aligned_alloc; + case LibFunc_calloc: + return AllocFnTy::calloc; + case LibFunc_realloc: + return AllocFnTy::realloc; + case LibFunc_reallocf: + return AllocFnTy::reallocf; + case LibFunc_Znwj: + // new(unsigned int); + return AllocFnTy::Znwj; + case LibFunc_ZnwjRKSt9nothrow_t: + // new(unsigned int, nothrow); + return AllocFnTy::ZnwjRKSt9nothrow_t; + case LibFunc_Znwm: + // new(unsigned long); + return AllocFnTy::Znwm; + case LibFunc_ZnwmRKSt9nothrow_t: + // new(unsigned long, nothrow); + return AllocFnTy::ZnwmRKSt9nothrow_t; + case LibFunc_Znaj: + // new[](unsigned int); + return AllocFnTy::Znaj; + case LibFunc_ZnajRKSt9nothrow_t: + // new[](unsigned int, nothrow); + return AllocFnTy::ZnajRKSt9nothrow_t; + case LibFunc_Znam: + // new[](unsigned long); + return AllocFnTy::Znam; + case LibFunc_ZnamRKSt9nothrow_t: + // new[](unsigned long, nothrow); + return AllocFnTy::ZnamRKSt9nothrow_t; + case LibFunc_msvc_new_int: + // new(unsigned int); + return AllocFnTy::msvc_new_int; + case LibFunc_msvc_new_int_nothrow: + // new(unsigned int, nothrow); + return AllocFnTy::msvc_new_int_nothrow; + case LibFunc_msvc_new_longlong: + // new(unsigned long long); + return AllocFnTy::msvc_new_longlong; + case LibFunc_msvc_new_longlong_nothrow: + // new(unsigned long long, nothrow); + return AllocFnTy::msvc_new_longlong_nothrow; + case LibFunc_msvc_new_array_int: + // new[](unsigned int); + return 
AllocFnTy::msvc_new_array_int; + case LibFunc_msvc_new_array_int_nothrow: + // new[](unsigned int, nothrow); + return AllocFnTy::msvc_new_array_int_nothrow; + case LibFunc_msvc_new_array_longlong: + // new[](unsigned long long); + return AllocFnTy::msvc_new_array_longlong; + case LibFunc_msvc_new_array_longlong_nothrow: + // new[](unsigned long long, nothrow); + return AllocFnTy::msvc_new_array_longlong_nothrow; + case LibFunc_ZnwjSt11align_val_t: + // new(unsigned int, align_val_t) + return AllocFnTy::ZnwjSt11align_val_t; + case LibFunc_ZnwmSt11align_val_t: + // new(unsigned long, align_val_t) + return AllocFnTy::ZnwmSt11align_val_t; + case LibFunc_ZnajSt11align_val_t: + // new[](unsigned int, align_val_t) + return AllocFnTy::ZnajSt11align_val_t; + case LibFunc_ZnamSt11align_val_t: + // new[](unsigned long, align_val_t) + return AllocFnTy::ZnamSt11align_val_t; + case LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t: + // new(unsigned int, align_val_t, nothrow) + return AllocFnTy::ZnwjSt11align_val_tRKSt9nothrow_t; + case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t: + // new(unsigned long, align_val_t, nothrow) + return AllocFnTy::ZnwmSt11align_val_tRKSt9nothrow_t; + case LibFunc_ZnajSt11align_val_tRKSt9nothrow_t: + // new[](unsigned int, align_val_t, nothrow) + return AllocFnTy::ZnajSt11align_val_tRKSt9nothrow_t; + case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t: + // new[](unsigned long, align_val_t, nothrow) + return AllocFnTy::ZnamSt11align_val_tRKSt9nothrow_t; + case LibFunc_posix_memalign: + // posix_memalign(void **, size_t, size_t) + return AllocFnTy::posix_memalign; + case LibFunc_strdup: + // strdup(const char *) + return AllocFnTy::strdup; + case LibFunc_strndup: + // strdup(const char *, size_t) + return AllocFnTy::strndup; + } + } + + // List of all free function types. This list needs to remain consistent with + // TargetLibraryInfo and with csi.h. 
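The `AllocFnTy` code chosen by `getAllocFnTy` is what lands in the 8-bit AllocFnTy field of CsiAllocFnProperty, so a tool receiving the property can recover which allocation routine was called. A small decoding sketch, spelling out only the first few enumerators of the list above:

```cpp
// Sketch of decoding the 8-bit AllocFnTy code carried in CsiAllocFnProperty.
// The numeric values mirror the start of the AllocFnTy enum above
// (malloc = 0, valloc, calloc, aligned_alloc, realloc, reallocf, ...);
// only a prefix of the list is spelled out here.
#include <cstdint>

static const char *allocFnName(uint8_t AllocFnTy) {
  static const char *Names[] = {"malloc",        "valloc",  "calloc",
                                "aligned_alloc", "realloc", "reallocf"};
  if (AllocFnTy < sizeof(Names) / sizeof(Names[0]))
    return Names[AllocFnTy];
  return "<other allocation function>";
}
```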
+ enum class FreeTy { + free = 0, + ZdlPv, + ZdlPvRKSt9nothrow_t, + ZdlPvj, + ZdlPvm, + ZdaPv, + ZdaPvRKSt9nothrow_t, + ZdaPvj, + ZdaPvm, + msvc_delete_ptr32, + msvc_delete_ptr32_nothrow, + msvc_delete_ptr32_int, + msvc_delete_ptr64, + msvc_delete_ptr64_nothrow, + msvc_delete_ptr64_longlong, + msvc_delete_array_ptr32, + msvc_delete_array_ptr32_nothrow, + msvc_delete_array_ptr32_int, + msvc_delete_array_ptr64, + msvc_delete_array_ptr64_nothrow, + msvc_delete_array_ptr64_longlong, + ZdlPvSt11align_val_t, + ZdlPvSt11align_val_tRKSt9nothrow_t, + ZdaPvSt11align_val_t, + ZdaPvSt11align_val_tRKSt9nothrow_t, + LAST_FREETY + }; + + static FreeTy getFreeTy(const LibFunc &F) { + switch (F) { + default: + return FreeTy::LAST_FREETY; + case LibFunc_free: + return FreeTy::free; + case LibFunc_ZdlPv: + // void operator delete(void*); + return FreeTy::ZdlPv; + case LibFunc_ZdlPvRKSt9nothrow_t: + // void operator delete(void*, nothrow); + return FreeTy::ZdlPvRKSt9nothrow_t; + case LibFunc_ZdlPvj: + // void operator delete(void*, unsigned int); + return FreeTy::ZdlPvj; + case LibFunc_ZdlPvm: + // void operator delete(void*, unsigned long); + return FreeTy::ZdlPvm; + case LibFunc_ZdaPv: + // void operator delete[](void*); + return FreeTy::ZdaPv; + case LibFunc_ZdaPvRKSt9nothrow_t: + // void operator delete[](void*, nothrow); + return FreeTy::ZdaPvRKSt9nothrow_t; + case LibFunc_ZdaPvj: + // void operator delete[](void*, unsigned int); + return FreeTy::ZdaPvj; + case LibFunc_ZdaPvm: + // void operator delete[](void*, unsigned long); + return FreeTy::ZdaPvm; + case LibFunc_msvc_delete_ptr32: + // void operator delete(void*); + return FreeTy::msvc_delete_ptr32; + case LibFunc_msvc_delete_ptr32_nothrow: + // void operator delete(void*, nothrow); + return FreeTy::msvc_delete_ptr32_nothrow; + case LibFunc_msvc_delete_ptr32_int: + // void operator delete(void*, unsigned int); + return FreeTy::msvc_delete_ptr32_int; + case LibFunc_msvc_delete_ptr64: + // void operator delete(void*); + return FreeTy::msvc_delete_ptr64; + case LibFunc_msvc_delete_ptr64_nothrow: + // void operator delete(void*, nothrow); + return FreeTy::msvc_delete_ptr64_nothrow; + case LibFunc_msvc_delete_ptr64_longlong: + // void operator delete(void*, unsigned long long); + return FreeTy::msvc_delete_ptr64_longlong; + case LibFunc_msvc_delete_array_ptr32: + // void operator delete[](void*); + return FreeTy::msvc_delete_array_ptr32; + case LibFunc_msvc_delete_array_ptr32_nothrow: + // void operator delete[](void*, nothrow); + return FreeTy::msvc_delete_array_ptr32_nothrow; + case LibFunc_msvc_delete_array_ptr32_int: + // void operator delete[](void*, unsigned int); + return FreeTy::msvc_delete_array_ptr32_int; + case LibFunc_msvc_delete_array_ptr64: + // void operator delete[](void*); + return FreeTy::msvc_delete_array_ptr64; + case LibFunc_msvc_delete_array_ptr64_nothrow: + // void operator delete[](void*, nothrow); + return FreeTy::msvc_delete_array_ptr64_nothrow; + case LibFunc_msvc_delete_array_ptr64_longlong: + // void operator delete[](void*, unsigned long long); + return FreeTy::msvc_delete_array_ptr64_longlong; + case LibFunc_ZdlPvSt11align_val_t: + // void operator delete(void*, align_val_t) + return FreeTy::ZdlPvSt11align_val_t; + case LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t: + // void operator delete(void*, align_val_t, nothrow) + return FreeTy::ZdlPvSt11align_val_tRKSt9nothrow_t; + case LibFunc_ZdaPvSt11align_val_t: + // void operator delete[](void*, align_val_t) + return FreeTy::ZdaPvSt11align_val_t; + case 
LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t: + // void operator delete[](void*, align_val_t, nothrow) + return FreeTy::ZdaPvSt11align_val_tRKSt9nothrow_t; + } + } + + void linkInToolFromBitcode(const std::string &BitcodePath); + void loadConfiguration(); + + Module &M; + const DataLayout &DL; + CallGraph *CG; + function_ref GetDomTree; + function_ref GetLoopInfo; + function_ref GetTaskInfo; + function_ref GetTLI; + std::optional> GetScalarEvolution; + std::optional> GetTTI; + CSIOptions Options; + + FrontEndDataTable FunctionFED, FunctionExitFED, BasicBlockFED, LoopFED, + LoopExitFED, CallsiteFED, LoadFED, StoreFED, AllocaFED, DetachFED, + TaskFED, TaskExitFED, DetachContinueFED, SyncFED, AllocFnFED, FreeFED; + + SmallVector UnitFedTables; + + SizeTable BBSize; + SmallVector UnitSizeTables; + + // Instrumentation hooks + FunctionCallee CsiFuncEntry = nullptr, CsiFuncExit = nullptr; + FunctionCallee CsiBBEntry = nullptr, CsiBBExit = nullptr; + FunctionCallee CsiBeforeCallsite = nullptr, CsiAfterCallsite = nullptr; + FunctionCallee CsiBeforeLoop = nullptr, CsiAfterLoop = nullptr; + FunctionCallee CsiLoopBodyEntry = nullptr, CsiLoopBodyExit = nullptr; + FunctionCallee CsiBeforeRead = nullptr, CsiAfterRead = nullptr; + FunctionCallee CsiBeforeWrite = nullptr, CsiAfterWrite = nullptr; + FunctionCallee CsiAfterAlloca = nullptr; + FunctionCallee CsiDetach = nullptr, CsiDetachContinue = nullptr; + FunctionCallee CsiTaskEntry = nullptr, CsiTaskExit = nullptr; + FunctionCallee CsiBeforeSync = nullptr, CsiAfterSync = nullptr; + FunctionCallee CsiBeforeAllocFn = nullptr, CsiAfterAllocFn = nullptr; + FunctionCallee CsiBeforeFree = nullptr, CsiAfterFree = nullptr; + + FunctionCallee MemmoveFn = nullptr, MemcpyFn = nullptr, MemsetFn = nullptr; + Function *InitCallsiteToFunction = nullptr; + // GlobalVariable *DisableInstrGV; + + // Runtime unit initialization + FunctionCallee RTUnitInit = nullptr; + + Type *IntptrTy; + DenseMap FuncOffsetMap; + + DenseMap, + SmallVector> ArgPHIs; + SmallPtrSet SyncsWithUnwinds; + DenseMap EntryBBInsertPt; + + std::unique_ptr Config; + + // Declarations of interposition functions. + DenseMap InterpositionFunctions; + + bool LinkedBitcode = false; + SmallSet FunctionsInBitcode; + SmallPtrSet LinkedFromBitcode; + + // // Cached results of calls to GetUnderlyingObject. + // using UnderlyingObjMapTy = DenseMap; + // mutable UnderlyingObjMapTy UnderlyingObject; + Value *lookupUnderlyingObject(Value *Addr) const; + + friend struct CSISetupImpl; +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_CSI_H diff --git a/llvm/include/llvm/Transforms/Instrumentation/CilkSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/CilkSanitizer.h new file mode 100644 index 000000000000000..f53ace2c9c3f89b --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/CilkSanitizer.h @@ -0,0 +1,34 @@ +//===- CilkSanitizer.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file is part of CilkSan, a determinacy-race detector for Cilk and Tapir +/// programs. +/// +/// This instrumentation pass inserts calls to the CilkSan runtime library +/// before appropriate memory accesses. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_CILKSANITIZER_H +#define LLVM_TRANSFORMS_CILKSANITIZER_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Instrumentation.h" + +namespace llvm { + +/// CilkSanitizer pass for new pass manager. +class CilkSanitizerPass : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_CILKSANITIZER_H diff --git a/llvm/include/llvm/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.h b/llvm/include/llvm/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.h new file mode 100644 index 000000000000000..0c81c35a36e8877 --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.h @@ -0,0 +1,48 @@ +//===- ComprehensiveStaticInstrumentation.h ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file is part of CSI, a framework that provides comprehensive static +/// instrumentation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_COMPREHENSIVESTATICINSTRUMENTATION_H +#define LLVM_TRANSFORMS_COMPREHENSIVESTATICINSTRUMENTATION_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Instrumentation.h" + +namespace llvm { + +/// CSISetup pass for new pass manager. +class CSISetupPass : public PassInfoMixin { +public: + CSISetupPass(); + CSISetupPass(const CSIOptions &Options); + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + +private: + CSIOptions Options; +}; + +/// ComprehensiveStaticInstrumentation pass for new pass manager. +class ComprehensiveStaticInstrumentationPass : + public PassInfoMixin { +public: + ComprehensiveStaticInstrumentationPass(); + ComprehensiveStaticInstrumentationPass(const CSIOptions &Options); + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + +private: + CSIOptions Options; +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_COMPREHENSIVESTATICINSTRUMENTATION_H diff --git a/llvm/include/llvm/Transforms/Instrumentation/SurgicalInstrumentationConfig.h b/llvm/include/llvm/Transforms/Instrumentation/SurgicalInstrumentationConfig.h new file mode 100644 index 000000000000000..403c320aee9dde5 --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/SurgicalInstrumentationConfig.h @@ -0,0 +1,156 @@ +//===-- SurgicalInstrumentationConfig.h -- Surgical CSI ------*- C++ -*----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is part of CSI, a framework that provides comprehensive static +// instrumentation. 
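CilkSanitizerPass, CSISetupPass, and ComprehensiveStaticInstrumentationPass are all module passes for the new pass manager, so they can be scheduled with a PassBuilder like any other pass. The pipeline below is a minimal, illustrative sketch; in normal use the OpenCilk clang driver wires the instrumentation in, so building it by hand like this is only for experimentation.

```cpp
// Minimal sketch of scheduling the CSI and CilkSanitizer module passes with
// the new pass manager.  Illustrative only; the OpenCilk clang driver normally
// sets this up.
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Instrumentation/CilkSanitizer.h"
#include "llvm/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.h"

using namespace llvm;

void instrumentModule(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(CSISetupPass());                           // per-function setup before instrumentation
  MPM.addPass(ComprehensiveStaticInstrumentationPass()); // insert __csi_* hooks
  // Alternatively, for determinacy-race detection:
  // MPM.addPass(CilkSanitizerPass());
  MPM.run(M, MAM);
}
```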
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_SURGICALINSTRUMENTATIONCONFIG_H +#define LLVM_TRANSFORMS_INSTRUMENTATION_SURGICALINSTRUMENTATIONCONFIG_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +enum InstrumentationConfigMode { WHITELIST = 0, BLACKLIST = 1 }; + +enum InstrumentationPoint : int { + INSTR_INVALID_POINT = 0x0, + INSTR_FUNCTION_ENTRY = 0x1, + INSTR_FUNCTION_EXIT = 0x1 << 1, + INSTR_BEFORE_CALL = 0x1 << 2, + INSTR_AFTER_CALL = 0x1 << 3, + INSTR_TAPIR_DETACH = 0x1 << 4, + INSTR_TAPIR_SYNC = 0x1 << 5, +}; + +#define INSTR_ALL_POINTS InstrumentationPoint::INSTR_INVALID_POINT + +inline InstrumentationPoint operator|(const InstrumentationPoint &a, + const InstrumentationPoint &b) { + return static_cast(static_cast(a) | + static_cast(b)); +} + +inline InstrumentationPoint operator&(const InstrumentationPoint &a, + const InstrumentationPoint &b) { + return static_cast(static_cast(a) & + static_cast(b)); +} + +inline bool operator==(InstrumentationPoint a, InstrumentationPoint b) { + return static_cast(a) == static_cast(b); +} + +inline InstrumentationPoint &operator|=(InstrumentationPoint &a, + InstrumentationPoint b) { + return a = a | b; +} + +static StringMap SurgicalInstrumentationPoints = { + {"FunctionEntry", INSTR_FUNCTION_ENTRY}, + { + "FunctionExit", + INSTR_FUNCTION_EXIT, + }, + { + "BeforeCall", + INSTR_BEFORE_CALL, + }, + { + "AfterCall", + INSTR_AFTER_CALL, + }, + { + "TapirDetach", + INSTR_TAPIR_DETACH, + }, + { + "TapirSync", + INSTR_TAPIR_SYNC, + }, +}; + +InstrumentationPoint +ParseInstrumentationPoint(const StringRef &instrPointString); + +class InstrumentationConfig { +public: + virtual ~InstrumentationConfig() {} + + void SetConfigMode(InstrumentationConfigMode mode) { this->mode = mode; } + + static std::unique_ptr GetDefault(); + + static std::unique_ptr + ReadFromConfigurationFile(const std::string &filename); + + virtual bool DoesFunctionRequireInterposition(const StringRef &functionName) { + return interposedFunctions.find(functionName) != interposedFunctions.end(); + } + + virtual bool DoesAnyFunctionRequireInterposition() { + return interposedFunctions.size() > 0; + } + + virtual bool DoesFunctionRequireInstrumentationForPoint( + const StringRef &functionName, const InstrumentationPoint &point) { + if (targetFunctions.size() == 0) + return true; + + bool found = targetFunctions.find(functionName) != targetFunctions.end(); + + if (found) // The function is in the configuration. Does it specify this + // instrumentation point? + { + InstrumentationPoint &functionPoints = targetFunctions[functionName]; + + if (functionPoints != INSTR_ALL_POINTS) { + if ((targetFunctions[functionName] & point) != point) + found = false; + } + } + + return mode == InstrumentationConfigMode::WHITELIST ? 
found : !found; + } + +protected: + InstrumentationConfig(){}; + InstrumentationConfig(const StringMap &targetFunctions, + const StringSet<> &interposedFunctions) + : targetFunctions(targetFunctions), + interposedFunctions(interposedFunctions) {} + + StringMap targetFunctions; + + StringSet<> interposedFunctions; + + InstrumentationConfigMode mode = InstrumentationConfigMode::WHITELIST; +}; + +class DefaultInstrumentationConfig : public InstrumentationConfig { +public: + virtual bool DoesFunctionRequireInstrumentationForPoint( + const StringRef &functionName, + const InstrumentationPoint &point) override { + return true; + } + + virtual bool DoesAnyFunctionRequireInterposition() override { return false; } + + virtual bool + DoesFunctionRequireInterposition(const StringRef &functionName) override { + return false; + } +}; +} // namespace llvm + +#endif // LLVM_TRANSFORMS_INSTRUMENTATION_SURGICALINSTRUMENTATIONCONFIG_H diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h index debe2ee7991728e..a7ca8b0b64fa94e 100644 --- a/llvm/include/llvm/Transforms/Scalar/GVN.h +++ b/llvm/include/llvm/Transforms/Scalar/GVN.h @@ -53,6 +53,7 @@ class NonLocalDepResult; class OptimizationRemarkEmitter; class PHINode; class TargetLibraryInfo; +class TaskInfo; class Value; /// A private "module" namespace for types and utilities used by GVN. These /// are implementation details and should not be used by clients. @@ -316,7 +317,8 @@ class GVNPass : public PassInfoMixin { bool runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, const TargetLibraryInfo &RunTLI, AAResults &RunAA, MemoryDependenceResults *RunMD, LoopInfo &LI, - OptimizationRemarkEmitter *ORE, MemorySSA *MSSA = nullptr); + OptimizationRemarkEmitter *ORE, TaskInfo *TI = nullptr, + MemorySSA *MSSA = nullptr); // List of critical edges to be split between iterations. SmallVector, 4> toSplit; diff --git a/llvm/include/llvm/Transforms/Scalar/IndVarSimplify.h b/llvm/include/llvm/Transforms/Scalar/IndVarSimplify.h index b5d544f1149c6dd..0870101a306cd28 100644 --- a/llvm/include/llvm/Transforms/Scalar/IndVarSimplify.h +++ b/llvm/include/llvm/Transforms/Scalar/IndVarSimplify.h @@ -32,6 +32,17 @@ class IndVarSimplifyPass : public PassInfoMixin { LoopStandardAnalysisResults &AR, LPMUpdater &U); }; +class TapirIndVarSimplifyPass : public PassInfoMixin { + /// Perform IV widening during the pass. 
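InstrumentationPoint is a bit-flag enum: a configuration can list several points for one function, and the pass tests membership with the overloaded `|`, `&`, and `==` operators, defaulting to whitelist mode. The sketch below exercises only the in-memory API declared above; it assumes `GetDefault()` returns `std::unique_ptr<InstrumentationConfig>`, and the on-disk format read by `ReadFromConfigurationFile` is not shown in this header.

```cpp
// Sketch of combining and testing surgical-instrumentation points with the
// operators declared above.  Only the in-memory API is exercised; the
// configuration-file format is not defined in this header.
#include "llvm/Transforms/Instrumentation/SurgicalInstrumentationConfig.h"

using namespace llvm;

bool wantsDetachHooks(InstrumentationConfig &Config, StringRef Fn) {
  // Should this function get detach instrumentation under the current
  // whitelist/blacklist mode?
  return Config.DoesFunctionRequireInstrumentationForPoint(
      Fn, InstrumentationPoint::INSTR_TAPIR_DETACH);
}

void example() {
  // Points compose as bit flags.
  InstrumentationPoint P = InstrumentationPoint::INSTR_FUNCTION_ENTRY |
                           InstrumentationPoint::INSTR_TAPIR_DETACH;
  bool HasDetach = (P & InstrumentationPoint::INSTR_TAPIR_DETACH) ==
                   InstrumentationPoint::INSTR_TAPIR_DETACH;
  (void)HasDetach;

  auto Config = InstrumentationConfig::GetDefault(); // assumed unique_ptr
  bool B = wantsDetachHooks(*Config, "fib");
  (void)B;
}
```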
+ bool WidenIndVars; + +public: + TapirIndVarSimplifyPass(bool WidenIndVars = true) + : WidenIndVars(WidenIndVars) {} + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + } // end namespace llvm #endif // LLVM_TRANSFORMS_SCALAR_INDVARSIMPLIFY_H diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h index a3f2ce23f7d9aa3..7815cdbfec17134 100644 --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -93,6 +93,8 @@ class JumpThreadingPass : public PassInfoMixin { #else SmallSet, 16> LoopHeaders; #endif + DenseMap, SmallPtrSet> + TapirTasks; unsigned BBDupThreshold; unsigned DefaultBBDupThreshold; @@ -112,6 +114,7 @@ class JumpThreadingPass : public PassInfoMixin { DomTreeUpdater *getDomTreeUpdater() const { return DTU.get(); } void findLoopHeaders(Function &F); + void findTapirTasks(Function &F, DominatorTree &DT); bool processBlock(BasicBlock *BB); bool maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB); void updateSSA(BasicBlock *BB, BasicBlock *NewBB, diff --git a/llvm/include/llvm/Transforms/Scalar/SROA.h b/llvm/include/llvm/Transforms/Scalar/SROA.h index c03cdf48fb1c686..b0217b51e13132e 100644 --- a/llvm/include/llvm/Transforms/Scalar/SROA.h +++ b/llvm/include/llvm/Transforms/Scalar/SROA.h @@ -20,12 +20,115 @@ namespace llvm { class Function; +class LLVMContext; +class PHINode; +class SelectInst; +class TaskInfo; +class Use; + +/// A private "module" namespace for types and utilities used by SROA. These +/// are implementation details and should not be used by clients. +namespace LLVM_LIBRARY_VISIBILITY sroa { + +class AllocaSliceRewriter; +class AllocaSlices; +class Partition; +class SROALegacyPass; + +class SelectHandSpeculativity { + unsigned char Storage = 0; // None are speculatable by default. + using TrueVal = Bitfield::Element; // Low 0'th bit. + using FalseVal = Bitfield::Element; // Low 1'th bit. +public: + SelectHandSpeculativity() = default; + SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal); + bool isSpeculatable(bool isTrueVal) const; + bool areAllSpeculatable() const; + bool areAnySpeculatable() const; + bool areNoneSpeculatable() const; + // For interop as int half of PointerIntPair. + explicit operator intptr_t() const { return static_cast(Storage); } + explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {} +}; +static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char)); + +using PossiblySpeculatableLoad = + PointerIntPair; +using UnspeculatableStore = StoreInst *; +using RewriteableMemOp = + std::variant; +using RewriteableMemOps = SmallVector; + +} // end namespace sroa enum class SROAOptions : bool { ModifyCFG, PreserveCFG }; class SROAPass : public PassInfoMixin { const SROAOptions PreserveCFG; + LLVMContext *C = nullptr; + DomTreeUpdater *DTU = nullptr; + AssumptionCache *AC = nullptr; + TaskInfo *TI = nullptr; + + /// Worklist of alloca instructions to simplify. + /// + /// Each alloca in the function is added to this. Each new alloca formed gets + /// added to it as well to recursively simplify unless that alloca can be + /// directly promoted. Finally, each time we rewrite a use of an alloca other + /// the one being actively rewritten, we add it back onto the list if not + /// already present to ensure it is re-visited. + SmallSetVector Worklist; + + /// A collection of instructions to delete. 
+ /// We try to batch deletions to simplify code and make things a bit more + /// efficient. We also make sure there is no dangling pointers. + SmallVector DeadInsts; + + /// Post-promotion worklist. + /// + /// Sometimes we discover an alloca which has a high probability of becoming + /// viable for SROA after a round of promotion takes place. In those cases, + /// the alloca is enqueued here for re-processing. + /// + /// Note that we have to be very careful to clear allocas out of this list in + /// the event they are deleted. + SmallSetVector PostPromotionWorklist; + + /// A collection of alloca instructions we can directly promote. + std::vector PromotableAllocas; + + /// A worklist of PHIs to speculate prior to promoting allocas. + /// + /// All of these PHIs have been checked for the safety of speculation and by + /// being speculated will allow promoting allocas currently in the promotable + /// queue. + SmallSetVector SpeculatablePHIs; + + /// A worklist of select instructions to rewrite prior to promoting + /// allocas. + SmallMapVector SelectsToRewrite; + + /// Select instructions that use an alloca and are subsequently loaded can be + /// rewritten to load both input pointers and then select between the result, + /// allowing the load of the alloca to be promoted. + /// From this: + /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other + /// %V = load , ptr %P2 + /// to: + /// %V1 = load , ptr %Alloca -> will be mem2reg'd + /// %V2 = load , ptr %Other + /// %V = select i1 %cond, %V1, %V2 + /// + /// We can do this to a select if its only uses are loads + /// and if either the operand to the select can be loaded unconditionally, + /// or if we are allowed to perform CFG modifications. + /// If found an intervening bitcast with a single use of the load, + /// allow the promotion. + static std::optional + isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG); +>>>>>>> 05e7e8a2c040 (This is a squash of the OpenCilk repo onto LLVM 17's release branch. All credit) + public: /// If \p PreserveCFG is set, then the pass is not allowed to modify CFG /// in any way, even if it would update CFG analyses. @@ -36,6 +139,25 @@ class SROAPass : public PassInfoMixin { void printPipeline(raw_ostream &OS, function_ref MapClassName2PassName); + +private: + friend class sroa::AllocaSliceRewriter; + friend class sroa::SROALegacyPass; + + /// Helper used by both the public run method and by the legacy pass. + PreservedAnalyses runImpl(Function &F, DomTreeUpdater &RunDTU, + AssumptionCache &RunAC, TaskInfo &RunTI); + PreservedAnalyses runImpl(Function &F, DominatorTree &RunDT, + AssumptionCache &RunAC, TaskInfo &RunTI); + + bool presplitLoadsAndStores(AllocaInst &AI, sroa::AllocaSlices &AS); + AllocaInst *rewritePartition(AllocaInst &AI, sroa::AllocaSlices &AS, + sroa::Partition &P); + bool splitAlloca(AllocaInst &AI, sroa::AllocaSlices &AS); + std::pair runOnAlloca(AllocaInst &AI); + void clobberUse(Use &U); + bool deleteDeadInstructions(SmallPtrSetImpl &DeletedAllocas); + bool promoteAllocas(Function &F); }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Tapir.h b/llvm/include/llvm/Transforms/Tapir.h new file mode 100644 index 000000000000000..17a1ec26af12f96 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir.h @@ -0,0 +1,67 @@ +//===- Tapir.h - Tapir Transformations --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for accessor functions that expose passes +// in the Tapir transformations library. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_H +#define LLVM_TRANSFORMS_TAPIR_H + +namespace llvm { +class Pass; +class ModulePass; +class FunctionPass; +enum class TapirTargetID; + +//===----------------------------------------------------------------------===// +// +// LoopSpawningTI - Create a loop spawning pass that uses Task Info. +// +Pass *createLoopSpawningTIPass(); + +//===----------------------------------------------------------------------===// +// +// LowerTapirToTarget - Lower Tapir constructs to a specified parallel runtime. +// +ModulePass *createLowerTapirToTargetPass(); + +//===----------------------------------------------------------------------===// +// +// TaskCanonicalize - Canonicalize Tapir tasks +// +FunctionPass *createTaskCanonicalizePass(); + +//===----------------------------------------------------------------------===// +// +// TaskSimplify - Simplify Tapir tasks +// +FunctionPass *createTaskSimplifyPass(); + +//===----------------------------------------------------------------------===// +// +// DRFScopedNoAlias - Add scoped-noalias information based on DRF assumption +// +FunctionPass *createDRFScopedNoAliasWrapperPass(); + +//===----------------------------------------------------------------------===// +// +// LoopStripMinePass - Stripmine Tapir loops +// +Pass *createLoopStripMinePass(int Count = -1); + +//===----------------------------------------------------------------------===// +// +// SerializeSmallTasksPass - Serialize small Tapir tasks +// +FunctionPass *createSerializeSmallTasksPass(); + +} // End llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/CudaABI.h b/llvm/include/llvm/Transforms/Tapir/CudaABI.h new file mode 100644 index 000000000000000..88ee58cc6f37dbf --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/CudaABI.h @@ -0,0 +1,91 @@ +//===- CudaABI.h - Interface to the Kitsune CUDA back end ------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Kitsune CUDA ABI to convert Tapir instructions to +// calls into the Kitsune runtime system for NVIDIA GPU code. 
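As an aside (not part of the patch), the pass-creation functions declared in `Tapir.h` above follow the usual legacy-pass-manager convention. A hypothetical driver could register them as shown below; the ordering is an assumption for illustration only, not the pipeline OpenCilk actually builds.

```cpp
// Hypothetical sketch: wiring the Tapir legacy-PM passes declared in
// llvm/Transforms/Tapir.h into a legacy pass pipeline. The ordering here is
// an assumption chosen for illustration.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Tapir.h"

using namespace llvm;

static void addTapirLoweringPasses(legacy::PassManager &PM) {
  PM.add(createTaskCanonicalizePass());    // canonicalize Tapir tasks
  PM.add(createTaskSimplifyPass());        // simplify Tapir tasks
  PM.add(createSerializeSmallTasksPass()); // serialize tasks too small to spawn
  PM.add(createLoopStripMinePass());       // stripmine Tapir loops
  PM.add(createLoopSpawningTIPass());      // spawn loop iterations efficiently
  PM.add(createLowerTapirToTargetPass());  // lower remaining Tapir constructs
}
```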
+// +//===----------------------------------------------------------------------===// +#ifndef CUDA_ABI_H_ +#define CUDA_ABI_H_ + +#include "llvm/Transforms/Tapir/LoweringUtils.h" +#include "llvm/Transforms/Tapir/TapirLoopInfo.h" + +namespace llvm { + +class DataLayout; +class TargetMachine; + +class CudaABI : public TapirTarget { +public: + CudaABI(Module &M) : TapirTarget(M) {} + ~CudaABI() {} + Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; + void lowerSync(SyncInst &SI) override final; + + void addHelperAttributes(Function &F) override final {} + bool preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) override final; + void postProcessFunction(Function &F, + bool ProcessingTapirLoops) override final; + void postProcessHelper(Function &F) override final; + + void preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final; + void postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final; + void preProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final; + void postProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final; + void processSubTaskCall(TaskOutlineInfo &TOI, + DominatorTree &DT) override final; + + LoopOutlineProcessor * + getLoopOutlineProcessor(const TapirLoopInfo *TL) const override final; +}; + +class PTXLoop : public LoopOutlineProcessor { +private: + static unsigned NextKernelID; + unsigned MyKernelID; + Module PTXM; + TargetMachine *PTXTargetMachine; + GlobalVariable *PTXGlobal; + + FunctionCallee GetThreadIdx = nullptr; + FunctionCallee GetBlockIdx = nullptr; + FunctionCallee GetBlockDim = nullptr; + FunctionCallee KitsuneCUDAInit = nullptr; + FunctionCallee KitsuneGPUInitKernel = nullptr; + FunctionCallee KitsuneGPUInitField = nullptr; + FunctionCallee KitsuneGPUSetRunSize = nullptr; + FunctionCallee KitsuneGPURunKernel = nullptr; + FunctionCallee KitsuneGPUFinish = nullptr; +public: + PTXLoop(Module &M); + + void setupLoopOutlineArgs( + Function &F, ValueSet &HelperArgs, SmallVectorImpl &HelperInputs, + ValueSet &InputSet, const SmallVectorImpl &LCArgs, + const SmallVectorImpl &LCInputs, + const ValueSet &TLInputsFixed) + override final; + unsigned getIVArgIndex(const Function &F, const ValueSet &Args) const + override final; + unsigned getLimitArgIndex(const Function &F, const ValueSet &Args) const + override final; + void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, + ValueToValueMapTy &VMap) override final; + void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, + DominatorTree &DT) override final; +}; +} + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/DRFScopedNoAliasAA.h b/llvm/include/llvm/Transforms/Tapir/DRFScopedNoAliasAA.h new file mode 100644 index 000000000000000..d0735df1ee9cad8 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/DRFScopedNoAliasAA.h @@ -0,0 +1,29 @@ +//===- DRFScopedNoAlias.h - DRF-based scoped-noalias metadata ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Adds scoped-noalias metadata to memory accesses based on Tapir's parallel +// control flow constructs and the assumption that the function is data-race +// free. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_DRFSCOPEDNOALIASPASS_H +#define LLVM_TRANSFORMS_TAPIR_DRFSCOPEDNOALIASPASS_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// The DRF-Based Scoped-Noalias Pass. +struct DRFScopedNoAliasPass : public PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/LambdaABI.h b/llvm/include/llvm/Transforms/Tapir/LambdaABI.h new file mode 100644 index 000000000000000..2fca38d533bfdd9 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/LambdaABI.h @@ -0,0 +1,99 @@ +//===- LambdaABI.h - Generic interface to runtime systems -------*- C++ -*--=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the Lambda ABI to convert Tapir instructions to calls +// into a generic runtime system to operates on spawned computations as lambdas. +// +//===----------------------------------------------------------------------===// +#ifndef LAMBDA_ABI_H_ +#define LAMBDA_ABI_H_ + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Tapir/LoweringUtils.h" + +namespace llvm { +class Value; +class TapirLoopInfo; + +class LambdaABI final : public TapirTarget { + ValueToValueMapTy DetachCtxToStackFrame; + + StringRef RuntimeBCPath = ""; + + // Runtime stack structure + StructType *StackFrameTy = nullptr; + FunctionType *SpawnBodyFnTy = nullptr; + Type *SpawnBodyFnArgTy = nullptr; + Type *SpawnBodyFnArgSizeTy = nullptr; + + // Runtime functions + FunctionCallee RTSEnterFrame = nullptr; + FunctionCallee RTSEnterHelperFrame = nullptr; + FunctionCallee RTSSpawn = nullptr; + FunctionCallee RTSLeaveFrame = nullptr; + FunctionCallee RTSLeaveHelperFrame = nullptr; + FunctionCallee RTSSync = nullptr; + FunctionCallee RTSSyncNoThrow = nullptr; + + FunctionCallee RTSLoopGrainsize8 = nullptr; + FunctionCallee RTSLoopGrainsize16 = nullptr; + FunctionCallee RTSLoopGrainsize32 = nullptr; + FunctionCallee RTSLoopGrainsize64 = nullptr; + + FunctionCallee RTSGetNumWorkers = nullptr; + FunctionCallee RTSGetWorkerID = nullptr; + + Align StackFrameAlign{8}; + + Value *CreateStackFrame(Function &F); + Value *GetOrCreateStackFrame(Function &F); + + CallInst *InsertStackFramePush(Function &F, + Instruction *TaskFrameCreate = nullptr, + bool Helper = false); + void InsertStackFramePop(Function &F, bool PromoteCallsToInvokes, + bool InsertPauseFrame, bool Helper); + +public: + LambdaABI(Module &M) : TapirTarget(M) {} + ~LambdaABI() { DetachCtxToStackFrame.clear(); } + + // void setOptions(const TapirTargetOptions &Options) override final; + + void prepareModule() override final; + Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; + void lowerSync(SyncInst &SI) override final; + // void lowerReducerOperation(CallBase *CI) override; + + ArgStructMode 
getArgStructMode() const override final { + return ArgStructMode::Static; + } + void addHelperAttributes(Function &F) override final; + + bool preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) override final; + void postProcessFunction(Function &F, + bool ProcessingTapirLoops) override final; + void postProcessHelper(Function &F) override final; + + void preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final; + void postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final; + void preProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final; + void postProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final; + void processSubTaskCall(TaskOutlineInfo &TOI, + DominatorTree &DT) override final; +}; +} // namespace llvm + +#endif // LAMBDA_ABI_H diff --git a/llvm/include/llvm/Transforms/Tapir/LoopSpawningTI.h b/llvm/include/llvm/Transforms/Tapir/LoopSpawningTI.h new file mode 100644 index 000000000000000..44a3bc808e559cb --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/LoopSpawningTI.h @@ -0,0 +1,27 @@ +//===- LoopSpawningTI.h - Spawn loop iterations efficiently -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass modifies Tapir loops to spawn their iterations efficiently. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H +#define LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// The LoopSpawning Pass. +struct LoopSpawningPass : public PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H diff --git a/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h b/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h new file mode 100644 index 000000000000000..270b77794620c7b --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h @@ -0,0 +1,54 @@ +//===- LoopStripMine.h - Tapir loop stripmining -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
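For illustration only, `LoopSpawningPass` declared in `LoopSpawningTI.h` above is an ordinary new-pass-manager module pass, so it can be driven with the standard `PassBuilder` boilerplate; nothing in this sketch is specific to the patch.

```cpp
// Hypothetical sketch: running the LoopSpawning module pass through the new
// pass manager, using standard PassBuilder analysis registration.
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Tapir/LoopSpawningTI.h"

using namespace llvm;

static void runLoopSpawning(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(LoopSpawningPass());
  MPM.run(M, MAM);
}
```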
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_LOOPSTRIPMINE_H +#define LLVM_TRANSFORMS_TAPIR_LOOPSTRIPMINE_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/InstructionCost.h" + +namespace llvm { + +class AssumptionCache; +class DominatorTree; +class Loop; +class LoopInfo; +class MDNode; +class OptimizationRemarkEmitter; +class ScalarEvolution; +class TargetLibraryInfo; +class TaskInfo; + +using NewLoopsMap = SmallDenseMap; + +void simplifyLoopAfterStripMine(Loop *L, bool SimplifyIVs, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, + const TargetTransformInfo &TTI, + AssumptionCache *AC); + +TargetTransformInfo::StripMiningPreferences gatherStripMiningPreferences( + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, + std::optional UserCount); + +bool computeStripMineCount(Loop *L, const TargetTransformInfo &TTI, + InstructionCost LoopCost, + TargetTransformInfo::StripMiningPreferences &UP); + +Loop *StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, + bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, + DominatorTree *DT, const TargetTransformInfo &TTI, + AssumptionCache *AC, TaskInfo *TI, + OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, + bool ParallelEpilog, bool NeedNestedSync, + Loop **Remainderloop = nullptr); + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_TAPIR_LOOPSTRIPMINE_H diff --git a/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h b/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h new file mode 100644 index 000000000000000..5b130c3e89d6241 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h @@ -0,0 +1,32 @@ +//===- LoopStripMinePass.h - Tapir loop stripmining -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_LOOPSTRIPMINEPASS_H +#define LLVM_TRANSFORMS_TAPIR_LOOPSTRIPMINEPASS_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { + +class Function; + +extern cl::opt EnableTapirLoopStripmine; + +/// Loop stripmining pass. It is a function pass to have access to function and +/// module analyses. +class LoopStripMinePass : public PassInfoMixin { +public: + explicit LoopStripMinePass() {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_TAPIR_LOOPSTRIPMINEPASS_H diff --git a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h new file mode 100644 index 000000000000000..27fa8f2e465ce56 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h @@ -0,0 +1,565 @@ +//===- LoweringUtils.h - Utility functions for lowering Tapir --*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
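For intuition about what the stripmining utilities above produce (an illustration, not code from the patch), stripmining a parallel loop by a grainsize `G` corresponds roughly to the following source-level transformation, assuming an OpenCilk-style `cilk_for`.

```cpp
// Illustrative sketch of Tapir-loop stripmining at the source level: the
// outer loop stays parallel over chunks of (assumed) grainsize G, while each
// chunk runs serially, amortizing per-iteration spawn overhead.
#include <cilk/cilk.h>
#include <algorithm>

void saxpy_stripmined(long n, float a, const float *x, float *y, long G) {
  cilk_for (long ii = 0; ii < n; ii += G) { // parallel over chunks
    long end = std::min(ii + G, n);
    for (long i = ii; i < end; ++i)         // serial within a chunk
      y[i] += a * x[i];
  }
}
```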
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements several utility functions for lowering Tapir. +// +//===----------------------------------------------------------------------===// + +#ifndef LOWERING_UTILS_H_ +#define LOWERING_UTILS_H_ + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +namespace llvm { + +class AAResults; +class AssumptionCache; +class BasicBlock; +class DominatorTree; +class Function; +class Loop; +class LoopOutlineProcessor; +class Spindle; +class TapirLoopInfo; +class Task; +class TaskInfo; +class Value; + +using ValueSet = SetVector; +using SpindleSet = SetVector; +using TaskValueSetMap = DenseMap; +using TFValueSetMap = DenseMap; + +struct OutlineAnalysis { + OutlineAnalysis(AAResults &AA, AssumptionCache &AC, DominatorTree &DT) + : AA(AA), AC(AC), DT(DT) { } + AAResults &AA; + AssumptionCache &AC; + DominatorTree &DT; +}; + +/// Structure that captures relevant information about an outlined task, +/// including the following: +/// -) A pointer to the outlined function. +/// -) The inputs passed to the call or invoke of that outlined function. +/// -) Pointers to the instructions that replaced the detach in the parent +/// function, ending with the call or invoke instruction to the outlined +/// function. +/// -) The normal and unwind destinations of the call or invoke of the outlined +/// function. +struct TaskOutlineInfo { + // The outlined helper function. + Function *Outline = nullptr; + + // Instruction in Outline corresponding to the detach point. + Instruction *DetachPt = nullptr; + + // Instruction in Outline corresponding to the taskframe.create. + Instruction *TaskFrameCreate = nullptr; + + // The set of values in the caller passed to the helper function. These + // values might be passed directly to a call to the helper function, or they + // might be marshalled into a structure. + ValueSet InputSet; + + // Instruction denoting the start of the code in the caller that replaced the + // task or Tapir loop. + Instruction *ReplStart = nullptr; + + // Instruction denoting the call or invoke instruction in the caller that + // calls the outlined helper function. + Instruction *ReplCall = nullptr; + + // Basic block to which the call to the outlined helper function returns. + // For an outlined task, this block corresponds to the continuation block + // of the original detach instruction. For an outlined Tapir loop, this + // block corresponds to the normal exit block after the loop latch. + BasicBlock *ReplRet = nullptr; + + // Basic block denoting the unwind destination of an invocation of the + // outlined helper function. This block corresponds to the unwind block of + // the original detach instruction, or nullptr if the original detach had no + // unwind block. + BasicBlock *ReplUnwind = nullptr; + + // Pointer to the basic block corresponding with the entry of this outlined + // taskframe in the function from which this taskframe was outlined. This + // pointer is maintained to help Tapir targets use taskframe-entry blocks as + // keys for target-specific maps. 
+ BasicBlock *OriginalTFEntry = nullptr; + + TaskOutlineInfo() = default; + TaskOutlineInfo(Function *Outline, BasicBlock *OriginalTFEntry, + Instruction *DetachPt, Instruction *TaskFrameCreate, + ValueSet &InputSet, Instruction *ReplStart, + Instruction *ReplCall, BasicBlock *ReplRet, + BasicBlock *ReplUnwind = nullptr) + : Outline(Outline), DetachPt(DetachPt), TaskFrameCreate(TaskFrameCreate), + InputSet(InputSet), ReplStart(ReplStart), ReplCall(ReplCall), + ReplRet(ReplRet), ReplUnwind(ReplUnwind), + OriginalTFEntry(OriginalTFEntry) {} + + // Replaces the stored call or invoke instruction to the outlined function + // with \p NewReplCall, and updates other information in this TaskOutlineInfo + // struct appropriately. + void replaceReplCall(Instruction *NewReplCall) { + if (ReplStart == ReplCall) + ReplStart = NewReplCall; + ReplCall = NewReplCall; + } + + // Helper routine to remap relevant TaskOutlineInfo values in the event, for + // instance, that these values are themselves outlined. + void remapOutlineInfo(ValueToValueMapTy &VMap, ValueToValueMapTy &InputMap) { + ReplStart = cast(VMap[ReplStart]); + ReplCall = cast(VMap[ReplCall]); + ReplRet = cast(VMap[ReplRet]); + if (ReplUnwind) + ReplUnwind = cast(VMap[ReplUnwind]); + + // Remap the contents of InputSet. + ValueSet NewInputSet; + for (Value *V : InputSet) { + if (VMap[V]) + NewInputSet.insert(VMap[V]); + else if (InputMap[V] && VMap[InputMap[V]]) + NewInputSet.insert(VMap[InputMap[V]]); + else + NewInputSet.insert(V); + } + InputSet = NewInputSet; + } +}; + +// Map from tasks to TaskOutlineInfo structures. +using TaskOutlineMapTy = DenseMap; +using TFOutlineMapTy = DenseMap; + +/// Abstract class for a parallel-runtime-system target for Tapir lowering. +/// +/// The majority of the Tapir-lowering infrastructure focuses on outlining Tapir +/// tasks into separate functions, which is a common lowering step for many +/// different back-ends. Most of the heavy-lifting for this outlining process +/// is handled by the lowering infrastructure itself, implemented in +/// TapirToTarget and LoweringUtils. The TapirTarget class defines several +/// callbacks to tailor this lowering process for a particular back-end. +/// +/// The high-level Tapir-lowering algorithm, including the TapirTarget +/// callbacks, operates as follows: +/// +/// 1) For each Function F in the Module, call +/// TapirTarget::shouldProcessFunction(F) to decide whether to enqueue F for +/// processing. +/// +/// 2) Process each enqueued Function F as follows: +/// +/// a) Run TapirTarget::preProcessFunction(F). +/// +/// b) If TapirTarget::shouldDoOutlining(F) returns false, skip the subsequent +/// outlining steps, and only process grainsize calls, task-frameaddress +/// calls, and sync instructions in F. +/// +/// c) For each Tapir task T in F in post-order: +/// +/// i) Prepare the set of inputs to a helper function for T, using the +/// return value of OutlineProcessor::getArgStructMode() to guide this +/// preparation. For example, if getArgStructMode() != None, insert code to +/// allocate a structure and marshal the inputs in that structure. +/// +/// ii) Outline T into a new Function Helper, using the set of inputs +/// prepared in step 2ci and a constant NULL return value of type +/// TapirTarget::getReturnType(). +/// +/// iii) Run TapirTarget::addHelperAttributes(Helper). +/// +/// d) Let Helper[T] denote the outlined Function for a task T. 
+/// +/// e) For each Tapir task T in F in post-order: +/// +/// i) Run TapirTarget::preProcessOutlinedTask(Helper[T]). +/// +/// ii) For each subtask SubT spawned by Helper[T], run +/// TapirTarget::processSubTaskCall(Helper[SubT]) +/// +/// iii) Run TapirTarget::postProcessOutlinedTask(Helper[T]). +/// +/// iv) Process the grainsize calls, task-frameaddress calls, and sync +/// instructions in Helper[T]. +/// +/// e) If F spawns tasks, run TapirTarget::preProcessRootSpawner(F); then, for +/// each child task T of F, run TapirTarget::processSubTaskCall(Helper[T]); +/// and finally run TapirTarget::postProcessRootSpawner(F). +/// +/// f) Process the grainsize calls, task-frameaddress calls, and sync +/// instructions in F. +/// +/// g) Run TapirTarget::postProcessFunction(F). +/// +/// h) For each generated helper Function Helper, run +/// TapirTarget::postProcessHelper(Helper). +class TapirTarget { +protected: + /// The Module of the original Tapir code. + Module &M; + /// The Module into which the outlined Helper functions will be placed. + Module &DestM; + + TapirTarget(Module &M, Module &DestM) : M(M), DestM(DestM) {} + +public: + // Enumeration of ways arguments can be passed to outlined functions. + enum class ArgStructMode { + None, // Pass arguments directly. + Static, // Statically allocate a structure to store arguments. + Dynamic // Dynamically allocate a structure to store arguments. + }; + + TapirTarget(Module &M) : M(M), DestM(M) {} + virtual ~TapirTarget() {} + + virtual void setOptions(const TapirTargetOptions &Options) {} + + // Prepare the module for final Tapir lowering. + virtual void prepareModule() {} + + /// Lower a call to the tapir.loop.grainsize intrinsic into a grainsize + /// (coarsening) value. + virtual Value *lowerGrainsizeCall(CallInst *GrainsizeCall) = 0; + + /// Lower a call to the task.frameaddress intrinsic to get the frame pointer + /// for the containing function, i.e., after the task has been outlined. + virtual void lowerTaskFrameAddrCall(CallInst *TaskFrameAddrCall); + + /// Lower a Tapir sync instruction SI. + virtual void lowerSync(SyncInst &SI) = 0; + + virtual void lowerReducerOperation(CallBase *Call) { + } + + /// Lower calls to the tapir.runtime.{start,end} intrinsics. Only + /// tapir.runtime.start intrinsics are stored; uses of those intrinsics + /// identify the tapir.runtime.end intrinsics to lower. + virtual void lowerTapirRTCalls(SmallVectorImpl &TapirRTCalls, + Function &F, BasicBlock *TFEntry); + + // TODO: Add more options to control outlining. + + /// Returns true if Function F should be processed. + virtual bool shouldProcessFunction(const Function &F) const; + + /// Returns true if tasks in Function F should be outlined into their own + /// functions. Such outlining is a common step for many Tapir backends. + virtual bool shouldDoOutlining(const Function &F) const { return true; } + + /// Process Function F before any function outlining is performed. This + /// routine should not modify the CFG structure, unless it processes all Tapir + /// instructions in F itself. Returns true if it modifies the CFG, false + /// otherwise. + virtual bool preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops = false) = 0; + + /// Returns an ArgStructMode enum value describing how inputs to a task should + /// be passed to the task, e.g., directly as arguments to the outlined + /// function, or marshalled in a structure. 
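To make the `lowerSync` hook declared earlier in this class concrete, here is a minimal sketch under the assumption of a purely serial lowering; it is not the implementation of any particular back end in this patch. Real targets, such as the OpenCilk ABI, emit runtime calls before branching to the continuation.

```cpp
// Minimal sketch, assuming a purely serial lowering: replace a Tapir sync
// with an unconditional branch to its continuation block.
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

static void lowerSyncSerially(SyncInst &SI) {
  // A sync instruction is a terminator with a single successor: the block
  // that continues after all spawned children have completed.
  BranchInst *Br = BranchInst::Create(SI.getSuccessor(0));
  ReplaceInstWithInst(&SI, Br);
}
```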
+ virtual ArgStructMode getArgStructMode() const { return ArgStructMode::None; } + + /// Get the return type of an outlined function for a task. + virtual Type *getReturnType() const { + return Type::getVoidTy(DestM.getContext()); + } + + /// Get the Module where outlined Helper will be placed. + Module &getDestinationModule() const { return DestM; } + + // Add attributes to the Function Helper produced from outlining a task. + virtual void addHelperAttributes(Function &Helper) {} + + // Remap any Target-local structures after taskframe starting at TFEntry is + // outlined. + virtual void remapAfterOutlining(BasicBlock *TFEntry, + ValueToValueMapTy &VMap) {} + + // Pre-process the Function F that has just been outlined from a task. This + // routine is executed on each outlined function by traversing in post-order + // the tasks in the original function. + virtual void preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, + bool IsSpawner, BasicBlock *TFEntry) = 0; + + // Post-process the Function F that has just been outlined from a task. This + // routine is executed on each outlined function by traversing in post-order + // the tasks in the original function. + virtual void postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, + bool IsSpawner, BasicBlock *TFEntry) = 0; + + // Pre-process the root Function F as a function that can spawn subtasks. + virtual void preProcessRootSpawner(Function &F, BasicBlock *TFEntry) = 0; + + // Post-process the root Function F as a function that can spawn subtasks. + virtual void postProcessRootSpawner(Function &F, BasicBlock *TFEntry) = 0; + + // Process the invocation of a task for an outlined function. This routine is + // invoked after processSpawner once for each child subtask. + virtual void processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) = 0; + + // Process Function F at the end of the lowering process. + virtual void postProcessFunction(Function &F, + bool ProcessingTapirLoops = false) = 0; + + // Process a generated helper Function F produced via outlining, at the end of + // the lowering process. + virtual void postProcessHelper(Function &F) = 0; + + virtual bool processOrdinaryFunction(Function &F, BasicBlock *TFEntry); + + // Get the LoopOutlineProcessor associated with this Tapir target. + virtual LoopOutlineProcessor * + getLoopOutlineProcessor(const TapirLoopInfo *TL) const { + return nullptr; + } +}; + +/// A loop-outline processor customizes the transformation of Tapir loops, +/// outlined via LoopSpawningTI, for a particular back-end. A loop-outline +/// processor serves a similar role for the LoopSpawningTI pass as a TapirTarget +/// serves for Tapir lowering. +/// +/// The LoopSpawningTI pass outlines Tapir loops by examining each Function F in +/// a Module and performing the following algorithm: +/// +/// 1) Analyze all loops in Function F to discover Tapir loops that are amenable +/// to LoopSpawningTI. +/// +/// 2) Run TapirTarget::preProcessFunction(F, OutliningTapirLoops = true). +/// +/// 3) Process each Tapir loop L as follows: +/// +/// a) Prepare the set of inputs to the helper function derived from the Tapir +/// task in L, using the return value of OutlineProcessor::getArgStructMode() +/// to guide this preparation. For example, if getArgStructMode() != None, +/// insert code to allocate a structure and marshal the inputs in that +/// structure. 
+/// +/// b) Run OutlineProcessor::setupLoopOutlineArgs() to get the complete set +/// of inputs for the outlined helper function for L. +/// +/// c) Outline L into a Function Helper, whose inputs are the prepared set of +/// inputs produced in step 2b and whose return type is void. This outlining +/// step uses OutlineProcessor::getIVArgIndex() and +/// OutlineProcessor::getLimitArgIndex() to identify the helper input +/// parameters that specify the strating and ending iterations, respectively. +/// +/// d) Call OutlineProcessor::postProcessOutline(Helper). +/// +/// 4) For each Tapir loop L in F in post-order, run +/// OutlineProcessor::processOutlinedLoopCall(). +/// +/// 5) Run TapirTarget::postProcessFunction(F, OutliningTapirLoops = true). +/// +/// Two generic loop-outline processors are provided with LoopSpawningTI. The +/// default loop-outline processor performs no special modifications to outlined +/// Tapir loops. The DACSpawning loop-outline processor transforms an outlined +/// Tapir loop to evaluate the iterations using parallel recursive +/// divide-and-conquer. +class LoopOutlineProcessor { +protected: + /// The Module of the original Tapir code. + Module &M; + /// The Module into which the outlined Helper functions will be placed. + Module &DestM; + + LoopOutlineProcessor(Module &M, Module &DestM) : M(M), DestM(DestM) {} +public: + using ArgStructMode = TapirTarget::ArgStructMode; + + LoopOutlineProcessor(Module &M) : M(M), DestM(M) {} + virtual ~LoopOutlineProcessor() = default; + + /// Returns an ArgStructMode enum value describing how inputs to the + /// underlying task of a Tapir loop should be passed to the task, e.g., + /// directly as arguments to the outlined function, or marshalled in a + /// structure. + virtual ArgStructMode getArgStructMode() const { + return ArgStructMode::None; + } + + /// Prepares the set HelperArgs of function arguments for the outlined helper + /// function Helper for a Tapir loop. Also prepares the list HelperInputs of + /// input values passed to a call to Helper. HelperArgs and HelperInputs are + /// derived from the loop-control arguments LCArgs and loop-control inputs + /// LCInputs for the Tapir loop, as well the set TLInputsFixed of arguments to + /// the task underlying the Tapir loop. + virtual void setupLoopOutlineArgs( + Function &F, ValueSet &HelperArgs, SmallVectorImpl &HelperInputs, + ValueSet &InputSet, const SmallVectorImpl &LCArgs, + const SmallVectorImpl &LCInputs, const ValueSet &TLInputsFixed); + + /// Get the Module where outlined Helper will be placed. + Module &getDestinationModule() const { return DestM; } + + /// Returns an integer identifying the index of the helper-function argument + /// in Args that specifies the starting iteration number. This return value + /// must complement the behavior of setupLoopOutlineArgs(). + virtual unsigned getIVArgIndex(const Function &F, const ValueSet &Args) const; + + /// Returns an integer identifying the index of the helper-function argument + /// in Args that specifies the ending iteration number. This return value + /// must complement the behavior of setupLoopOutlineArgs(). + virtual unsigned getLimitArgIndex(const Function &F, + const ValueSet &Args) const { + return getIVArgIndex(F, Args) + 1; + } + + /// Processes an outlined Function Helper for a Tapir loop, just after the + /// function has been outlined. 
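As a hypothetical illustration of the two index hooks described above (this processor does not exist in the patch), a custom loop-outline processor that keeps the default argument layout could look like the sketch below; returning 0 for the induction-variable index is an assumption made for the example.

```cpp
// Hypothetical sketch: a LoopOutlineProcessor that keeps the default
// argument layout, i.e. the start iteration is the first helper argument and
// the end iteration immediately follows it.
#include "llvm/Transforms/Tapir/LoweringUtils.h"

using namespace llvm;

class SimpleLoopOutlineProcessor : public LoopOutlineProcessor {
public:
  SimpleLoopOutlineProcessor(Module &M) : LoopOutlineProcessor(M) {}

  unsigned getIVArgIndex(const Function &F,
                         const ValueSet &Args) const override {
    return 0; // assumption: the start iteration is passed first
  }
  // getLimitArgIndex() is inherited; it returns getIVArgIndex() + 1, so the
  // end iteration is taken from the next argument.
};
```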
+ virtual void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, + ValueToValueMapTy &VMap); + + /// Add syncs to the escape points of each helper function. This operation is + /// a common post-processing step for outlined helper functions. + void addSyncToOutlineReturns(TapirLoopInfo &TL, TaskOutlineInfo &Out, + ValueToValueMapTy &VMap); + + /// Move Cilksan instrumentation out of cloned loop. + void moveCilksanInstrumentation(TapirLoopInfo &TL, TaskOutlineInfo &Out, + ValueToValueMapTy &VMap); + + /// Remap any data members of the LoopOutlineProcessor. This method is called + /// whenever a loop L is outlined, in order to update data for subloops of L. + virtual void remapData(ValueToValueMapTy &VMap) {}; + + /// Processes a call to an outlined Function Helper for a Tapir loop. + virtual void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, + DominatorTree &DT) {} +}; + +/// Generate a TapirTarget object for the specified TapirTargetID. +TapirTarget *getTapirTargetFromID(Module &M, TapirTargetID TargetID); + +/// Find all inputs to tasks within a function \p F, including nested tasks. +TaskValueSetMap findAllTaskInputs(Function &F, const DominatorTree &DT, + const TaskInfo &TI); + +void getTaskFrameInputsOutputs(TFValueSetMap &TFInputs, + TFValueSetMap &TFOutputs, + const Spindle &TF, const ValueSet *TaskInputs, + const TaskInfo &TI, const DominatorTree &DT); + +void findAllTaskFrameInputs(TFValueSetMap &TFInputs, + TFValueSetMap &TFOutputs, + const SmallVectorImpl &AllTaskFrames, + Function &F, const DominatorTree &DT, TaskInfo &TI); + +/// Create a struct to store the inputs to pass to an outlined function for the +/// task \p T. Stores into the struct will be inserted \p StorePt, which should +/// precede the detach. Loads from the struct will be inserted at \p LoadPt, +/// which should be inside \p T. If a Tapir loop \p TapirL is specified, then +/// its header block is also considered a valid load point. +std::pair +createTaskArgsStruct(const ValueSet &Inputs, Task *T, Instruction *StorePt, + Instruction *LoadPt, bool staticStruct, + ValueToValueMapTy &InputsMap, + Loop *TapirL = nullptr); + +/// Organize the set \p Inputs of values in \p F into a set \p Fixed of values +/// that can be used as inputs to a helper function. +void fixupInputSet(Function &F, const ValueSet &Inputs, ValueSet &Fixed); + +/// Organize the inputs to task \p T, given in \p TaskInputs, to create an +/// appropriate set of inputs, \p HelperInputs, to pass to the outlined function +/// for \p T. If a Tapir loop \p TapirL is specified, then its header block is +/// also used in fixing up inputs. +Instruction *fixupHelperInputs(Function &F, Task *T, ValueSet &TaskInputs, + ValueSet &HelperInputs, Instruction *StorePt, + Instruction *LoadPt, + TapirTarget::ArgStructMode useArgStruct, + ValueToValueMapTy &InputsMap, + Loop *TapirL = nullptr); + +/// Returns true if BasicBlock \p B is the immediate successor of only +/// detached-rethrow instructions. +bool isSuccessorOfDetachedRethrow(const BasicBlock *B); + +/// Collect the set of blocks in task \p T. All blocks enclosed by \p T will be +/// pushed onto \p TaskBlocks. The set of blocks terminated by reattaches from +/// \p T are added to \p ReattachBlocks. The set of blocks terminated by +/// detached-rethrow instructions are added to \p DetachedRethrowBlocks. The +/// set of entry points to exception-handling blocks shared by \p T and other +/// tasks in the same function are added to \p SharedEHEntries. 
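As a hedged usage sketch of `getTapirTargetFromID` declared above (not code from the patch): a driver might construct a target for a module and use it to lower a single grainsize intrinsic call. The ownership of the returned target and the handling of the returned value are assumptions made for this example.

```cpp
// Hypothetical sketch: constructing a TapirTarget from a target ID and using
// it to lower one tapir.loop.grainsize call. The target ID would normally
// come from the -ftapir= option; error handling is elided.
#include "llvm/Transforms/Tapir/LoweringUtils.h"
#include <memory>

using namespace llvm;

static void lowerOneGrainsizeCall(Module &M, TapirTargetID TargetID,
                                  CallInst *GrainsizeCall) {
  // Assumption: the caller owns the TapirTarget returned by the factory.
  std::unique_ptr<TapirTarget> Target(getTapirTargetFromID(M, TargetID));
  if (!Target)
    return;
  Target->prepareModule();
  // The target rewrites the grainsize intrinsic into a concrete coarsening
  // value (assumption: the caller replaces uses of the intrinsic with it).
  Value *Grainsize = Target->lowerGrainsizeCall(GrainsizeCall);
  (void)Grainsize;
}
```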
+void getTaskBlocks(Task *T, std::vector &TaskBlocks, + SmallPtrSetImpl &ReattachBlocks, + SmallPtrSetImpl &TaskResumeBlocks, + SmallPtrSetImpl &SharedEHEntries, + const DominatorTree *DT); + +/// Outlines the content of task \p T in function \p F into a new helper +/// function. The parameter \p Inputs specified the inputs to the helper +/// function. The map \p VMap is updated with the mapping of instructions in +/// \p T to instructions in the new helper function. +Function *createHelperForTask( + Function &F, Task *T, ValueSet &Inputs, Module *DestM, + ValueToValueMapTy &VMap, Type *ReturnType, OutlineAnalysis &OA); + +/// Outlines the content of taskframe \p TF in function \p F into a new helper +/// function. The parameter \p Inputs specified the inputs to the helper +/// function. The map \p VMap is updated with the mapping of instructions in \p +/// TF to instructions in the new helper function. +Function *createHelperForTaskFrame( + Function &F, Spindle *TF, ValueSet &Args, Module *DestM, + ValueToValueMapTy &VMap, Type *ReturnType, OutlineAnalysis &OA); + +/// Replaces the taskframe \p TF, with associated TaskOutlineInfo \p Out, with a +/// call or invoke to the outlined helper function created for \p TF. +Instruction *replaceTaskFrameWithCallToOutline( + Spindle *TF, TaskOutlineInfo &Out, SmallVectorImpl &OutlineInputs); + +/// Outlines a task \p T into a helper function that accepts the inputs \p +/// Inputs. The map \p VMap is updated with the mapping of instructions in \p T +/// to instructions in the new helper function. Information about the helper +/// function is returned as a TaskOutlineInfo structure. +TaskOutlineInfo outlineTask( + Task *T, ValueSet &Inputs, SmallVectorImpl &HelperInputs, + Module *DestM, ValueToValueMapTy &VMap, + TapirTarget::ArgStructMode useArgStruct, Type *ReturnType, + ValueToValueMapTy &InputMap, OutlineAnalysis &OA); + +/// Outlines a taskframe \p TF into a helper function that accepts the inputs \p +/// Inputs. The map \p VMap is updated with the mapping of instructions in \p +/// TF to instructions in the new helper function. Information about the helper +/// function is returned as a TaskOutlineInfo structure. +TaskOutlineInfo outlineTaskFrame( + Spindle *TF, ValueSet &Inputs, SmallVectorImpl &HelperInputs, + Module *DestM, ValueToValueMapTy &VMap, + TapirTarget::ArgStructMode useArgStruct, Type *ReturnType, + ValueToValueMapTy &InputMap, OutlineAnalysis &OA); + +//----------------------------------------------------------------------------// +// Methods for lowering Tapir loops + +/// Given a Tapir loop \p TL and the set of inputs to the task inside that loop, +/// returns the set of inputs for the Tapir loop itself. +ValueSet getTapirLoopInputs(TapirLoopInfo *TL, ValueSet &TaskInputs); + + +/// Replaces the Tapir loop \p TL, with associated TaskOutlineInfo \p Out, with +/// a call or invoke to the outlined helper function created for \p TL. +Instruction *replaceLoopWithCallToOutline( + TapirLoopInfo *TL, TaskOutlineInfo &Out, + SmallVectorImpl &OutlineInputs); + +} // end namepsace llvm + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/OMPTaskABI.h b/llvm/include/llvm/Transforms/Tapir/OMPTaskABI.h new file mode 100644 index 000000000000000..87b97aff36bbcc1 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/OMPTaskABI.h @@ -0,0 +1,99 @@ +//===- OMPTaskABI.h - Generic interface to runtime systems -------*- C++ -*--=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
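For intuition about what the outlining utilities above produce, here is an informal source-level picture (the names are invented for the example; the real helpers are generated in LLVM IR): the body of a spawned task is moved into a helper whose parameters are the task's inputs, and the detach is replaced by a call to that helper.

```cpp
// Illustrative sketch of task outlining at the source level.
static void work(int *a, int n) {
  for (int i = 0; i < n; ++i)
    a[i] += 1;
}

// Helper produced by outlining the spawned task: its parameters are the
// task's inputs (here, a and n). This corresponds to TaskOutlineInfo::Outline.
static void parent_spawn_helper(int *a, int n) {
  work(a, n);
}

void parent(int *a, int n) {
  // Originally: cilk_spawn work(a, n);  ...  cilk_sync;
  // After outlining, the detach is replaced by a call to the helper
  // (TaskOutlineInfo::ReplCall), which returns to the detach's continuation
  // (TaskOutlineInfo::ReplRet). A real target wraps this call in its own
  // spawn mechanism.
  parent_spawn_helper(a, n);
}
```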
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the OMP Task ABI to convert Tapir instructions to calls +// into kmpc task runtime calls. +// +//===----------------------------------------------------------------------===// +#ifndef OMPTASK_ABI_H_ +#define OMPTASK_ABI_H_ + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Tapir/LoweringUtils.h" + +namespace llvm { +class Value; +class TapirLoopInfo; + +class OMPTaskABI final : public TapirTarget { + ValueToValueMapTy DetachCtxToStackFrame; + + StringRef RuntimeBCPath = ""; + + // Runtime stack structure + StructType *StackFrameTy = nullptr; + StructType *TaskTy = nullptr; + FunctionType *SpawnBodyFnTy = nullptr; + Type *SpawnBodyFnArgTy = nullptr; + Type *SpawnBodyFnArgSizeTy = nullptr; + + // Runtime functions + FunctionCallee RTSEnterFrame = nullptr; + FunctionCallee RTSGetArgsFromTask = nullptr; + FunctionCallee RTSSpawn = nullptr; + FunctionCallee RTSSync = nullptr; + FunctionCallee RTSSyncNoThrow = nullptr; + + FunctionCallee RTSLoopGrainsize8 = nullptr; + FunctionCallee RTSLoopGrainsize16 = nullptr; + FunctionCallee RTSLoopGrainsize32 = nullptr; + FunctionCallee RTSLoopGrainsize64 = nullptr; + + FunctionCallee RTSGetNumWorkers = nullptr; + FunctionCallee RTSGetWorkerID = nullptr; + + Align StackFrameAlign{8}; + + Value *CreateStackFrame(Function &F); + Value *GetOrCreateStackFrame(Function &F); + + CallInst *InsertStackFramePush(Function &F, + Instruction *TaskFrameCreate = nullptr, + bool Helper = false); + void InsertStackFramePop(Function &F, bool PromoteCallsToInvokes, + bool InsertPauseFrame, bool Helper); + +public: + OMPTaskABI(Module &M) : TapirTarget(M) {} + ~OMPTaskABI() { DetachCtxToStackFrame.clear(); } + + // void setOptions(const TapirTargetOptions &Options) override final; + + void prepareModule() override final; + Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; + void lowerSync(SyncInst &SI) override final; + // void lowerReducerOperation(CallBase *CI) override; + + ArgStructMode getArgStructMode() const override final { + return ArgStructMode::Static; + } + void addHelperAttributes(Function &F) override final; + + bool preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) override final; + void postProcessFunction(Function &F, + bool ProcessingTapirLoops) override final; + void postProcessHelper(Function &F) override final; + + void preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final; + void postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final; + void preProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final; + void postProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final; + void processSubTaskCall(TaskOutlineInfo &TOI, + DominatorTree &DT) override final; + +}; +} // namespace llvm + +#endif // OMPTASK_ABI_H diff --git a/llvm/include/llvm/Transforms/Tapir/OpenCilkABI.h b/llvm/include/llvm/Transforms/Tapir/OpenCilkABI.h new file mode 100644 index 000000000000000..a4bae8a902ca5ff --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/OpenCilkABI.h @@ -0,0 +1,197 @@ +//===- OpenilkABI.h - Interface to the 
OpenCilk runtime system ---*- C++ -*--=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the OpenCilk ABI to convert Tapir instructions to calls +// into the OpenCilk runtime system. +// +//===----------------------------------------------------------------------===// +#ifndef OPEN_CILK_ABI_H_ +#define OPEN_CILK_ABI_H_ + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Tapir/LoweringUtils.h" + +namespace llvm { +class Value; +class TapirLoopInfo; + +class OpenCilkABI final : public TapirTarget { + ValueToValueMapTy DetachCtxToStackFrame; + SmallPtrSet CallsToInline; + DenseMap> TapirRTCalls; + ValueToValueMapTy DefaultSyncLandingpad; + + StringRef RuntimeBCPath = ""; + + // Cilk RTS data types + StructType *StackFrameTy = nullptr; + StructType *WorkerTy = nullptr; + + // Opaque Cilk RTS functions + FunctionCallee CilkRTSEnterFrame = nullptr; + FunctionCallee CilkRTSEnterFrameHelper = nullptr; + FunctionCallee CilkRTSDetach = nullptr; + FunctionCallee CilkRTSLeaveFrame = nullptr; + FunctionCallee CilkRTSLeaveFrameHelper = nullptr; + FunctionCallee CilkPrepareSpawn = nullptr; + FunctionCallee CilkSync = nullptr; + FunctionCallee CilkSyncNoThrow = nullptr; + FunctionCallee CilkParentEpilogue = nullptr; + FunctionCallee CilkHelperEpilogue = nullptr; + FunctionCallee CilkRTSEnterLandingpad = nullptr; + FunctionCallee CilkRTSPauseFrame = nullptr; + + FunctionCallee CilkRTSReducerRegister32 = nullptr; + FunctionCallee CilkRTSReducerRegister64 = nullptr; + FunctionCallee CilkRTSReducerUnregister = nullptr; + FunctionCallee CilkRTSReducerLookup = nullptr; + + // Accessors for opaque Cilk RTS functions + FunctionCallee CilkHelperEpilogueExn = nullptr; + FunctionCallee CilkRTSCilkForGrainsize8 = nullptr; + FunctionCallee CilkRTSCilkForGrainsize16 = nullptr; + FunctionCallee CilkRTSCilkForGrainsize32 = nullptr; + FunctionCallee CilkRTSCilkForGrainsize64 = nullptr; + + MaybeAlign StackFrameAlign{8}; + + // Accessors for CilkRTS ABI functions. When a bitcode file is loaded, these + // functions should return the function defined in the bitcode file. + // Otherwise, these functions will return FunctionCallees for placeholder + // declarations of these functions. The latter case is intended for debugging + // ABI-call insertion. 
+ FunctionCallee Get__cilkrts_enter_frame() { + return CilkRTSEnterFrame; + } + FunctionCallee Get__cilkrts_enter_frame_helper() { + return CilkRTSEnterFrameHelper; + } + FunctionCallee Get__cilkrts_detach() { + return CilkRTSDetach; + } + FunctionCallee Get__cilkrts_leave_frame() { + return CilkRTSLeaveFrame; + } + FunctionCallee Get__cilkrts_leave_frame_helper() { + return CilkRTSLeaveFrameHelper; + } + FunctionCallee Get__cilkrts_pause_frame() { + return CilkRTSPauseFrame; + } + FunctionCallee Get__cilkrts_enter_landingpad() { + return CilkRTSEnterLandingpad; + } + FunctionCallee Get__cilkrts_cilk_for_grainsize_8() { + return CilkRTSCilkForGrainsize8; + } + FunctionCallee Get__cilkrts_cilk_for_grainsize_16() { + return CilkRTSCilkForGrainsize16; + } + FunctionCallee Get__cilkrts_cilk_for_grainsize_32() { + return CilkRTSCilkForGrainsize32; + } + FunctionCallee Get__cilkrts_cilk_for_grainsize_64() { + return CilkRTSCilkForGrainsize64; + } + FunctionCallee Get__cilkrts_reducer_register(unsigned Bits) { + if (Bits == 32) + return CilkRTSReducerRegister32; + if (Bits == 64) + return CilkRTSReducerRegister64; + return 0; + } + FunctionCallee Get__cilkrts_reducer_unregister() { + return CilkRTSReducerUnregister; + } + FunctionCallee Get__cilkrts_reducer_lookup() { + return CilkRTSReducerLookup; + } + + // Helper functions for implementing the Cilk ABI protocol + FunctionCallee GetCilkPrepareSpawnFn() { + return CilkPrepareSpawn; + } + FunctionCallee GetCilkSyncFn() { + return CilkSync; + } + FunctionCallee GetCilkSyncNoThrowFn() { + return CilkSyncNoThrow; + } + FunctionCallee GetCilkParentEpilogueFn() { + return CilkParentEpilogue; + } + FunctionCallee GetCilkHelperEpilogueFn() { + return CilkHelperEpilogue; + } + FunctionCallee GetCilkHelperEpilogueExnFn() { + return CilkHelperEpilogueExn; + } + + void GetTapirRTCalls(Spindle *TaskFrame, bool IsRootTask, TaskInfo &TI); + void LowerTapirRTCalls(Function &F, BasicBlock *TFEntry); + + Value *CreateStackFrame(Function &F); + Value *GetOrCreateCilkStackFrame(Function &F); + + CallInst *InsertStackFramePush(Function &F, + Instruction *TaskFrameCreate = nullptr, + bool Helper = false); + void InsertStackFramePop(Function &F, bool PromoteCallsToInvokes, + bool InsertPauseFrame, bool Helper); + + void InsertDetach(Function &F, Instruction *DetachPt); + + void MarkSpawner(Function &F); + + BasicBlock *GetDefaultSyncLandingpad(Function &F, Value *SF, DebugLoc Loc); + +public: + OpenCilkABI(Module &M); + ~OpenCilkABI() { DetachCtxToStackFrame.clear(); } + + void setOptions(const TapirTargetOptions &Options) override final; + + void prepareModule() override final; + Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; + void lowerSync(SyncInst &SI) override final; + void lowerReducerOperation(CallBase *CI) override; + + ArgStructMode getArgStructMode() const override final { + return ArgStructMode::None; + } + void addHelperAttributes(Function &F) override final; + + void remapAfterOutlining(BasicBlock *TFEntry, + ValueToValueMapTy &VMap) override final; + + bool preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) override final; + void postProcessFunction(Function &F, + bool ProcessingTapirLoops) override final; + void postProcessHelper(Function &F) override final; + + void preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final; + void postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, 
bool IsSpawner, + BasicBlock *TFEntry) override final; + void preProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final; + void postProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final; + void processSubTaskCall(TaskOutlineInfo &TOI, + DominatorTree &DT) override final; + bool processOrdinaryFunction(Function &F, BasicBlock *TFEntry) override final; + + LoopOutlineProcessor * + getLoopOutlineProcessor(const TapirLoopInfo *TL) const override final; +}; +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/OpenMPABI.h b/llvm/include/llvm/Transforms/Tapir/OpenMPABI.h new file mode 100644 index 000000000000000..f7fcb4ad417e302 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/OpenMPABI.h @@ -0,0 +1,66 @@ +//===- OpenMPABI.h - Interface to the OpenMP runtime -----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the OpenMP ABI to converts Tapir instructions to calls +// into the OpenMP runtime system. +// +//===----------------------------------------------------------------------===// +#ifndef OMP_ABI_H_ +#define OMP_ABI_H_ + +#include "llvm/Transforms/Tapir/LoweringUtils.h" + +namespace llvm { +class Value; + +enum OpenMPRuntimeFunction { + OMPRTL__kmpc_fork_call, + OMPRTL__kmpc_for_static_init_4, + OMPRTL__kmpc_for_static_fini, + OMPRTL__kmpc_master, + OMPRTL__kmpc_end_master, + OMPRTL__kmpc_omp_task_alloc, + OMPRTL__kmpc_omp_task, + OMPRTL__kmpc_omp_taskwait, + OMPRTL__kmpc_global_thread_num, + OMPRTL__kmpc_barrier, + OMPRTL__kmpc_global_num_threads, +}; + +enum OpenMPSchedType { + OMP_sch_static = 34, +}; + +class OpenMPABI : public TapirTarget { +public: + OpenMPABI(Module &M); + Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; + void lowerSync(SyncInst &SI) override final; + + bool preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) override final; + void postProcessFunction(Function &F, + bool ProcessingTapirLoops) override final; + void postProcessHelper(Function &F) override final; + + void preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final; + void postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final; + void preProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final; + void postProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final; + void processSubTaskCall(TaskOutlineInfo &TOI, + DominatorTree &DT) override final; +}; + +} // end of llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/Outline.h b/llvm/include/llvm/Transforms/Tapir/Outline.h new file mode 100644 index 000000000000000..e94b20840eef5de --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/Outline.h @@ -0,0 +1,90 @@ +//===- Outline.h - Outlining for Tapir -------------------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines helper functions for outlining portions of code containing +// Tapir instructions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_OUTLINE_H +#define LLVM_TRANSFORMS_TAPIR_OUTLINE_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +namespace llvm { + +using ValueSet = SetVector; + +// Value materializer for Tapir outlining. +class OutlineMaterializer : public ValueMaterializer { + const Value *SrcSyncRegion = nullptr; +public: + OutlineMaterializer(const Value *SrcSyncRegion = nullptr) + : SrcSyncRegion(SrcSyncRegion) {} + virtual ~OutlineMaterializer() { + BlocksToRemap.clear(); + } + + Value *materialize(Value *V) override; + + SetVector BlocksToRemap; +}; + +/// Clone Blocks into NewFunc, transforming the old arguments into references to +/// VMap values. +/// +/// TODO: Fix the std::vector part of the type of this function. +void CloneIntoFunction( + Function *NewFunc, const Function *OldFunc, + std::vector Blocks, ValueToValueMapTy &VMap, + bool ModuleLevelChanges, SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ReattachBlocks = nullptr, + SmallPtrSetImpl *DetachedRethrowBlocks = nullptr, + SmallPtrSetImpl *SharedEHEntries = nullptr, + DISubprogram *SP = nullptr, ClonedCodeInfo *CodeInfo = nullptr, + ValueMapTypeRemapper *TypeMapper = nullptr, + OutlineMaterializer *Materializer = nullptr); + +/// Create a helper function whose signature is based on Inputs and +/// Outputs as follows: f(in0, ..., inN, out0, ..., outN) +/// +/// TODO: Fix the std::vector part of the type of this function. +Function * +CreateHelper(const ValueSet &Inputs, const ValueSet &Outputs, + std::vector Blocks, BasicBlock *Header, + const BasicBlock *OldEntry, const BasicBlock *OldExit, + ValueToValueMapTy &VMap, Module *DestM, bool ModuleLevelChanges, + SmallVectorImpl &Returns, const StringRef NameSuffix, + SmallPtrSetImpl *ReattachBlocks = nullptr, + SmallPtrSetImpl *TaskResumeBlocks = nullptr, + SmallPtrSetImpl *SharedEHEntries = nullptr, + const BasicBlock *OldUnwind = nullptr, + SmallPtrSetImpl *UnreachableExits = nullptr, + Type *ReturnType = nullptr, ClonedCodeInfo *CodeInfo = nullptr, + ValueMapTypeRemapper *TypeMapper = nullptr, + OutlineMaterializer *Materializer = nullptr); + +// Add alignment assumptions to parameters of outlined function, based on known +// alignment data in the caller. 
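As a source-level analogue of the alignment assumptions mentioned above (illustrative only; the outliner emits the corresponding `llvm.assume` in IR, and the 32-byte alignment here is an arbitrary example):

```cpp
// Illustrative only: an alignment assumption on an outlined helper's pointer
// parameter, written with the Clang/GCC builtin.
#include <cstddef>

void helper(float *p, std::size_t n) {
  float *pa = static_cast<float *>(__builtin_assume_aligned(p, 32));
  for (std::size_t i = 0; i < n; ++i)
    pa[i] *= 2.0f;
}
```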
+void AddAlignmentAssumptions(const Function *Caller, const ValueSet &Args, + ValueToValueMapTy &VMap, + const Instruction *CallSite, + AssumptionCache *AC, DominatorTree *DT); + +} // End llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/QthreadsABI.h b/llvm/include/llvm/Transforms/Tapir/QthreadsABI.h new file mode 100644 index 000000000000000..6b3b4bbf5b55cfc --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/QthreadsABI.h @@ -0,0 +1,82 @@ +//===- QthreadsABI.h - Interface to the Qthreads runtime ----*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Qthreads back end to convert Tapir instructions to +// calls into the Qthreads runtime system. +// +//===----------------------------------------------------------------------===// +#ifndef QTHREADS_ABI_H_ +#define QTHREADS_ABI_H_ + +#include "llvm/Transforms/Tapir/LoweringUtils.h" + +namespace llvm { + +class QthreadsABI : public TapirTarget { + ValueToValueMapTy SyncRegionToSinc; + + Type *QthreadFTy = nullptr; + + // Opaque Qthreads RTS functions + FunctionCallee QthreadNumWorkers = nullptr; + FunctionCallee QthreadForkCopyargs = nullptr; + FunctionCallee QthreadInitialize = nullptr; + FunctionCallee QtSincCreate = nullptr; + FunctionCallee QtSincExpect = nullptr; + FunctionCallee QtSincSubmit = nullptr; + FunctionCallee QtSincWait = nullptr; + FunctionCallee QtSincDestroy = nullptr; + + // Accessors for opaque Qthreads RTS functions + FunctionCallee get_qthread_num_workers(); + FunctionCallee get_qthread_fork_copyargs(); + FunctionCallee get_qthread_initialize(); + FunctionCallee get_qt_sinc_create(); + FunctionCallee get_qt_sinc_expect(); + FunctionCallee get_qt_sinc_submit(); + FunctionCallee get_qt_sinc_wait(); + FunctionCallee get_qt_sinc_destroy(); + + Value *getOrCreateSinc(Value *SyncRegion, Function *F); +public: + QthreadsABI(Module &M); + ~QthreadsABI() { SyncRegionToSinc.clear(); } + + ArgStructMode getArgStructMode() const override final { + return ArgStructMode::Static; + } + Type *getReturnType() const override final { + return Type::getInt32Ty(M.getContext()); + } + + Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; + void lowerSync(SyncInst &SI) override final; + + bool preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) override final; + void postProcessFunction(Function &F, + bool ProcessingTapirLoops) override final; + void postProcessHelper(Function &F) override final; + + void preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final {} + void postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final {} + void preProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final {} + void postProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final { + } + void processSubTaskCall(TaskOutlineInfo &TOI, + DominatorTree &DT) override final; +}; + +} // end of llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/SerialABI.h b/llvm/include/llvm/Transforms/Tapir/SerialABI.h new file mode 100644 index 000000000000000..c7cd7f5b85306bc --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/SerialABI.h @@ -0,0 +1,53 
@@ +//===- SerialABI.h - Replace Tapir with serial projection ------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Serial back end, which is used to convert Tapir +// instructions into their serial projection. +// +//===----------------------------------------------------------------------===// +#ifndef SERIAL_ABI_H_ +#define SERIAL_ABI_H_ + +#include "llvm/Transforms/Tapir/LoweringUtils.h" + +namespace llvm { + +class SerialABI : public TapirTarget { +public: + SerialABI(Module &M) : TapirTarget(M) {} + ~SerialABI() {} + + Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; + void lowerSync(SyncInst &inst) override final; + + bool shouldDoOutlining(const Function &F) const override final { + return false; + } + bool preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) override final; + void postProcessFunction(Function &F, + bool ProcessingTapirLoops) override final {} + void postProcessHelper(Function &F) override final {} + + void preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final {} + void postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, bool IsSpawner, + BasicBlock *TFEntry) override final {} + void preProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final {} + void postProcessRootSpawner(Function &F, BasicBlock *TFEntry) override final { + } + void processSubTaskCall(TaskOutlineInfo &TOI, + DominatorTree &DT) override final {} +}; + +} // end of llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/SerializeSmallTasks.h b/llvm/include/llvm/Transforms/Tapir/SerializeSmallTasks.h new file mode 100644 index 000000000000000..8155b936037fd5c --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/SerializeSmallTasks.h @@ -0,0 +1,29 @@ +//===- SerializeSmallTasks.h - Serialize small Tapir tasks ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_SERIALIZESMALLTASKS_H_ +#define LLVM_TRANSFORMS_TAPIR_SERIALIZESMALLTASKS_H_ + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class Function; + +/// Pass to serialize small Tapir tasks, whose work is too little to overcome +/// the overhead of a spawn. +class SerializeSmallTasksPass : public PassInfoMixin { +public: + explicit SerializeSmallTasksPass() {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_TAPIR_SERIALIZESMALLTASKS_H_ diff --git a/llvm/include/llvm/Transforms/Tapir/TapirLoopInfo.h b/llvm/include/llvm/Transforms/Tapir/TapirLoopInfo.h new file mode 100644 index 000000000000000..2d35a0995a003d9 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/TapirLoopInfo.h @@ -0,0 +1,252 @@ +//===- TapirLoopInfo.h - Utility functions for Tapir loops -----*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
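The SerialABI and QthreadsABI targets above both implement TapirTarget's `lowerSync` hook. As a rough illustration of what the serial projection does with a sync, the sketch below (an assumption about the shape of such code, not the OpenCilk implementation itself) replaces the sync terminator with an unconditional branch to its continuation, since serial execution needs no runtime synchronization.

```c++
// Minimal sketch, not the OpenCilk implementation: lower a Tapir sync for a
// serial target by branching straight to the continuation.  Assumes the
// Tapir fork's SyncInst is a terminator whose successor 0 is the sync
// continuation block.
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

static void lowerSyncSerially(SyncInst &SI) {
  BasicBlock *Continuation = SI.getSuccessor(0);
  // ReplaceInstWithInst erases the sync and installs the branch in its place.
  ReplaceInstWithInst(&SI, BranchInst::Create(Continuation));
}
```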
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines utility functions for handling Tapir loops. +// +//===----------------------------------------------------------------------===// + +#ifndef TAPIR_LOOP_INFO_H_ +#define TAPIR_LOOP_INFO_H_ + +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Transforms/Tapir/LoweringUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +namespace llvm { + +class AssumptionCache; +class BasicBlock; +class DominatorTree; +class ICmpInst; +class Instruction; +class OptimizationRemarkAnalysis; +class OptimizationRemarkEmitter; +class PHINode; +class PredicatedScalarEvolution; +class ScalarEvolution; +class TargetTransformInfo; + +/// Class for managing information about a Tapir loop, primarily for the purpose +/// of outlining Tapir loops. +/// +/// A Tapir loop is defined as an ordinary Loop whose body -- all code in the +/// loop except for the indiction variables and loop control --- is contained in +/// a spawned task. +class TapirLoopInfo { +public: + /// InductionList saves induction variables and maps them to the induction + /// descriptor. + using InductionList = MapVector; + + TapirLoopInfo(Loop *L, Task *T) : TheLoop(L), TheTask(T) { + // Get the exit block for this loop. + Instruction *TI = TheLoop->getLoopLatch()->getTerminator(); + ExitBlock = TI->getSuccessor(0); + if (ExitBlock == TheLoop->getHeader()) + ExitBlock = TI->getSuccessor(1); + + // Get the unwind destination for this loop. + DetachInst *DI = T->getDetach(); + if (DI->hasUnwindDest()) + UnwindDest = DI->getUnwindDest(); + } + + /// Constructor that automatically reads the metadata for the loop. + TapirLoopInfo(Loop *L, Task *T, OptimizationRemarkEmitter &ORE) + : TapirLoopInfo(L, T) { + readTapirLoopMetadata(ORE); + } + + ~TapirLoopInfo() { + if (StartIterArg) + delete StartIterArg; + if (EndIterArg) + delete EndIterArg; + if (GrainsizeArg) + delete GrainsizeArg; + + DescendantTasks.clear(); + Inductions.clear(); + } + + Loop *getLoop() const { return TheLoop; } + Task *getTask() const { return TheTask; } + + /// Top-level call to prepare a Tapir loop for outlining. + bool prepareForOutlining( + DominatorTree &DT, LoopInfo &LI, TaskInfo &TI, + PredicatedScalarEvolution &PSE, AssumptionCache &AC, const char *PassName, + OptimizationRemarkEmitter &ORE, const TargetTransformInfo &TTI); + + /// Gather all induction variables in this loop that need special handling + /// during outlining. + bool collectIVs(PredicatedScalarEvolution &PSE, const char *PassName, + OptimizationRemarkEmitter *ORE); + + /// Replace all induction variables in this loop that are not primary with + /// stronger forms. + void replaceNonPrimaryIVs(PredicatedScalarEvolution &PSE); + + /// Identify the loop condition instruction, and determine if the loop uses an + /// inclusive or exclusive range. + bool getLoopCondition(const char *PassName, OptimizationRemarkEmitter *ORE); + + /// Fix up external users of the induction variable. + void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, + PredicatedScalarEvolution &PSE); + + /// Returns (and creates if needed) the original loop trip count. 
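TapirLoopInfo's definition of a Tapir loop (all of the body in a spawned task, with the induction variable and loop control left in the parent) corresponds at the source level to a parallel loop. A small, illustrative OpenCilk example of a loop that lowers to this form:

```c++
// Illustrative OpenCilk source for a Tapir loop: each iteration's body runs
// in a spawned task; the loop counter and exit test stay with the parent.
// Build with OpenCilk clang++ and -fopencilk.
#include <cilk/cilk.h>

void vector_add(double *a, const double *b, long n) {
  cilk_for (long i = 0; i < n; ++i)
    a[i] += b[i];
}
```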
+ const SCEV *getBackedgeTakenCount(PredicatedScalarEvolution &PSE) const; + const SCEV *getExitCount(const SCEV *BackedgeTakenCount, + PredicatedScalarEvolution &PSE) const; + // Return a non-overflowing value representing the trip count. For the + // typical case of a loop over a non-inclusive range (e.g., i \in [0,n), + // excluding n), this value is the backedge count plus 1. But to avoid + // overflow conditions, for a loop over an inclusive range (e.g., i \in [0,n], + // including n), this value is simply the backedge count. Passes are expected + // to use isInclusiveRange() to determine when they need to handle loops over + // inclusive ranges as a special case. + Value *getOrCreateTripCount(PredicatedScalarEvolution &PSE, + const char *PassName, + OptimizationRemarkEmitter *ORE); + + /// Record task T as a descendant task under this loop and not under a + /// descendant Tapir loop. + void addDescendantTask(Task *T) { DescendantTasks.push_back(T); } + + /// Adds \p Phi, with induction descriptor ID, to the inductions list. This + /// can set \p Phi as the main induction of the loop if \p Phi is a better + /// choice for the main induction than the existing one. + void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID); + + /// Returns the original loop trip count, if it has been computed. + Value *getTripCount() const { + assert(TripCount.pointsToAliveValue() && + "TripCount does not point to alive value."); + return TripCount; + } + + /// Returns the original loop condition, if it has been computed. + ICmpInst *getCondition() const { return Condition; } + + /// Returns true if this loop condition includes the end iteration. + bool isInclusiveRange() const { return InclusiveRange; } + + /// Returns the widest induction type. + Type *getWidestInductionType() const { return WidestIndTy; } + + /// Returns true if there is a primary induction variable for this Tapir loop. + bool hasPrimaryInduction() const { + return (nullptr != PrimaryInduction); + } + + /// Get the primary induction variable for this Tapir loop. + const std::pair &getPrimaryInduction() const { + assert(PrimaryInduction && "No primary induction."); + return *Inductions.find(PrimaryInduction); + } + + /// Returns the induction variables found in the loop. + InductionList *getInductionVars() { return &Inductions; } + + /// Get the grainsize associated with this Tapir Loop. A return value of 0 + /// indicates the absence of a specified grainsize. + unsigned getGrainsize() const { return Grainsize; } + + /// Get the exit block assoicated with this Tapir loop. + BasicBlock *getExitBlock() const { return ExitBlock; } + + /// Get the unwind destination for this Tapir loop. + BasicBlock *getUnwindDest() const { return UnwindDest; } + + /// Get the set of tasks enclosed in this Tapir loop and not a descendant + /// Tapir loop. + void getEnclosedTasks(SmallVectorImpl &TaskVec) const { + TaskVec.push_back(TheTask); + for (Task *T : reverse(DescendantTasks)) + TaskVec.push_back(T); + } + + /// Update information on this Tapir loop based on its metadata. + void readTapirLoopMetadata(OptimizationRemarkEmitter &ORE); + + /// Get the debug location for this loop. + DebugLoc getDebugLoc() const { return TheTask->getDetach()->getDebugLoc(); } + + /// Create an analysis remark that explains why the transformation failed + /// + /// \p RemarkName is the identifier for the remark. If \p I is passed it is + /// an instruction that prevents the transformation. Otherwise \p TheLoop is + /// used for the location of the remark. 
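The comment on getOrCreateTripCount above carries the key subtlety: for an exclusive range the trip count is the backedge-taken count plus one, but for an inclusive range that plus-one can overflow, so the backedge-taken count itself is used and isInclusiveRange() flags the special case. A self-contained worked example of the overflow (illustrating the arithmetic only, not code from this patch):

```c++
// Worked example of the trip-count convention: the "+1" for an exclusive
// range is safe, but the same "+1" wraps for an inclusive range whose bound
// is the maximum value of the induction variable's type.
#include <cassert>
#include <cstdint>

int main() {
  // for (uint32_t i = 0; i < n; ++i): backedge taken n-1 times, trip count n.
  uint32_t n = 10;
  uint32_t backedge_exclusive = n - 1;
  assert(backedge_exclusive + 1 == n);

  // for (uint32_t i = 0; i <= m; ++i) with m == UINT32_MAX iterates 2^32
  // times, but m + 1 wraps to 0 in 32 bits -- the overflow the comment warns
  // about, hence keeping the backedge count and isInclusiveRange().
  uint32_t m = UINT32_MAX;
  assert(m + 1 == 0u);
  return 0;
}
```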
\return the remark object that can + /// be streamed to. + /// + /// Based on createMissedAnalysis in the LoopVectorize pass. + static OptimizationRemarkAnalysis + createMissedAnalysis(const char *PassName, StringRef RemarkName, + const Loop *TheLoop, Instruction *I = nullptr); + +private: + /// The loop that we evaluate. + Loop *TheLoop; + + /// The task contained in this loop. + Task *TheTask; + + /// Descendants of TheTask that are enclosed by this loop and not a descendant + /// Tapir loop. + SmallVector DescendantTasks; + + /// The single exit block for this Tapir loop. + BasicBlock *ExitBlock = nullptr; + + /// The unwind destination of this Tapir loop, if it has one. + BasicBlock *UnwindDest = nullptr; + + /// Holds the primary induction variable. This is the counter of the loop. + PHINode *PrimaryInduction = nullptr; + + /// Holds all of the induction variables that we found in the loop. Notice + /// that inductions don't need to start at zero and that induction variables + /// can be pointers. + InductionList Inductions; + + /// Holds the widest induction type encountered. + Type *WidestIndTy = nullptr; + + /// Trip count of the original loop. + WeakTrackingVH TripCount; + + /// Latch condition of the original loop. + ICmpInst *Condition = nullptr; + bool InclusiveRange = false; + + /// Grainsize value to use for loop. A value of 0 indicates that a call to + /// Tapir's grainsize intrinsic should be used. + unsigned Grainsize = 0; + +public: + /// Placeholder argument values. + Argument *StartIterArg = nullptr; + Argument *EndIterArg = nullptr; + Argument *GrainsizeArg = nullptr; +}; + +/// Transforms an induction descriptor into a direct computation of its value at +/// Index. +Value *emitTransformedIndex( + IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, + const InductionDescriptor &ID); + +} // end namepsace llvm + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h b/llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h new file mode 100644 index 000000000000000..a187967e9e577ec --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h @@ -0,0 +1,83 @@ +//===- TapirTargetIDs.h - Tapir target ID's --------------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file enumerates the available Tapir lowering targets. +// +//===----------------------------------------------------------------------===// + +#ifndef TAPIR_TARGET_IDS_H_ +#define TAPIR_TARGET_IDS_H_ + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" + +namespace llvm { + +enum class TapirTargetID { + None, // Perform no lowering + Serial, // Lower to serial projection + Cheetah, // Lower to the Cheetah ABI + Lambda, // Lower to generic Lambda ABI + OMPTask, // Lower to OpenMP task ABI + OpenCilk, // Lower to OpenCilk ABI + Qthreads, // Lower to Qthreads + Last_TapirTargetID +}; + +// Tapir target options + +// Virtual base class for Target-specific options. 
+class TapirTargetOptions { +public: + enum TapirTargetOptionKind { TTO_OpenCilk, Last_TTO }; + +private: + const TapirTargetOptionKind Kind; + +public: + TapirTargetOptionKind getKind() const { return Kind; } + + TapirTargetOptions(TapirTargetOptionKind K) : Kind(K) {} + TapirTargetOptions(const TapirTargetOptions &) = delete; + TapirTargetOptions &operator=(const TapirTargetOptions &) = delete; + virtual ~TapirTargetOptions() {} + + // Top-level method for cloning TapirTargetOptions. Defined in + // TargetLibraryInfo. + TapirTargetOptions *clone() const; +}; + +// Options for OpenCilkABI Tapir target. +class OpenCilkABIOptions : public TapirTargetOptions { + std::string RuntimeBCPath; + + OpenCilkABIOptions() = delete; + +public: + OpenCilkABIOptions(StringRef Path) + : TapirTargetOptions(TTO_OpenCilk), RuntimeBCPath(Path) {} + + StringRef getRuntimeBCPath() const { + return RuntimeBCPath; + } + + static bool classof(const TapirTargetOptions *TTO) { + return TTO->getKind() == TTO_OpenCilk; + } + +protected: + friend TapirTargetOptions; + + OpenCilkABIOptions *cloneImpl() const { + return new OpenCilkABIOptions(RuntimeBCPath); + } +}; + +} // end namespace llvm + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/TapirToTarget.h b/llvm/include/llvm/Transforms/Tapir/TapirToTarget.h new file mode 100644 index 000000000000000..8aa744f6e4ced93 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/TapirToTarget.h @@ -0,0 +1,34 @@ +//===- TapirToTarget.h - Lower Tapir to target ABI --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass lowers Tapir construct to a specified runtime ABI. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_TAPIRTOTARGET_H +#define LLVM_TRANSFORMS_TAPIR_TAPIRTOTARGET_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" + +namespace llvm { + +/// The TapirToTarget Pass. +struct TapirToTargetPass : public PassInfoMixin { + TapirToTargetPass(TapirTargetID TargetID = TapirTargetID::Last_TapirTargetID) + : TargetID(TargetID) {} + + /// \brief Run the pass over the module. + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + TapirTargetID TargetID; +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_TAPIR_TAPIRTOTARGET_H diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h index c99df6bf94d0259..0a5938a1c813539 100644 --- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -155,6 +155,7 @@ struct CriticalEdgeSplittingOptions { /// provided. If it cannot be preserved, no splitting will take place. If it /// is not set, preserve loop-simplify form if possible. 
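OpenCilkABIOptions above uses LLVM's usual classof-based RTTI, so lowering code can recover the OpenCilk-specific settings from a generic TapirTargetOptions pointer. A hedged usage sketch; the helper function is hypothetical:

```c++
// Hypothetical consumer: recover the OpenCilk runtime-bitcode path from a
// generic options pointer via dyn_cast, which relies on classof() above.
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Tapir/TapirTargetIDs.h"

using namespace llvm;

StringRef runtimeBitcodePathOrEmpty(const TapirTargetOptions *TTO) {
  if (const auto *OC = dyn_cast_or_null<OpenCilkABIOptions>(TTO))
    return OC->getRuntimeBCPath();
  return StringRef();
}
```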
bool PreserveLoopSimplify = true; + bool SplitDetachContinue = false; CriticalEdgeSplittingOptions(DominatorTree *DT = nullptr, LoopInfo *LI = nullptr, @@ -186,6 +187,11 @@ struct CriticalEdgeSplittingOptions { PreserveLoopSimplify = false; return *this; } + + CriticalEdgeSplittingOptions &setSplitDetachContinue() { + SplitDetachContinue = true; + return *this; + } }; /// When a loop exit edge is split, LCSSA form may require new PHIs in the new diff --git a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h index 429d6a2e05236fd..53a7333146c5969 100644 --- a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h @@ -76,6 +76,13 @@ namespace llvm { bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, StringRef Name); + /// Analyze the name the given function and set any applicable attributes. If + /// the library function is unavailable, this doesn't modify it. + /// + /// Returns true if any attributes were set and false otherwise. + bool inferTapirTargetLibFuncAttributes(Function &F, + const TargetLibraryInfo &TLI); + /// Check whether the overloaded floating point function /// corresponding to \a Ty is available. bool hasFloatFn(const Module *M, const TargetLibraryInfo *TLI, Type *Ty, diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h index 6226062dd713f62..1a5d0464a7e48d2 100644 --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -42,6 +42,7 @@ class Loop; class LoopInfo; class Module; class ProfileSummaryInfo; +class ResumeInst; class ReturnInst; class DomTreeUpdater; @@ -72,6 +73,9 @@ struct ClonedCodeInfo { /// are in the entry block but are not a constant size. bool ContainsDynamicAllocas = false; + /// This is set to true if the cloned code contains a detach instruction. + bool ContainsDetach = false; + /// All cloned call sites that have operand bundles attached are appended to /// this vector. This vector may contain nulls or undefs if some of the /// originally inserted callsites were DCE'ed after they were cloned. @@ -178,6 +182,7 @@ void CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, const Instruction *StartingInst, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl &Returns, + SmallVectorImpl &Resumes, const char *NameSuffix = "", ClonedCodeInfo *CodeInfo = nullptr); @@ -195,6 +200,7 @@ void CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl &Returns, + SmallVectorImpl &Resumes, const char *NameSuffix = "", ClonedCodeInfo *CodeInfo = nullptr); diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index 6937ec8dfd21c74..4d6a6970149ff1d 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -402,6 +402,13 @@ Instruction *removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU = nullptr); bool removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU = nullptr, MemorySSAUpdater *MSSAU = nullptr); +/// Remove all detach-unwind blocks that do not catch exceptions from detached +/// tasks. +/// +/// Returns true if any basic block was removed. 
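The new SplitDetachContinue flag on CriticalEdgeSplittingOptions lets callers ask for detach-continue edges to be split along with ordinary critical edges. A hedged usage sketch; SplitAllCriticalEdges and the options constructor are existing LLVM utilities, only the setter is new in this patch:

```c++
// Request splitting of critical detach-continue edges in addition to the
// usual critical edges; DT and LI are kept up to date through the options.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

unsigned splitWithDetachContinue(Function &F, DominatorTree *DT, LoopInfo *LI) {
  CriticalEdgeSplittingOptions Options(DT, LI);
  Options.setSplitDetachContinue();
  return SplitAllCriticalEdges(F, Options);
}
```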
+bool removeDeadDetachUnwinds(Function &F, DomTreeUpdater *DTU = nullptr, + MemorySSAUpdater *MSSAU = nullptr); + /// Combine the metadata of two instructions so that K can replace J. Some /// metadata kinds can only be kept if K does not move, meaning it dominated /// J in the original IR. diff --git a/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h b/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h index 61bf93b74a15a0e..159269e6b33f463 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h @@ -23,6 +23,7 @@ class MemorySSAUpdater; class ScalarEvolution; struct SimplifyQuery; class TargetTransformInfo; +class TaskInfo; /// Convert a loop into a loop with bottom test. It may /// perform loop latch simplication as well if the flag RotationOnly @@ -32,8 +33,9 @@ class TargetTransformInfo; /// LoopRotation. If it is true, the profitability heuristic will be ignored. bool LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC, DominatorTree *DT, ScalarEvolution *SE, - MemorySSAUpdater *MSSAU, const SimplifyQuery &SQ, - bool RotationOnly, unsigned Threshold, bool IsUtilMode, + MemorySSAUpdater *MSSAU, TaskInfo *TI, + const SimplifyQuery &SQ, bool RotationOnly, + unsigned Threshold, bool IsUtilMode, bool PrepareForLTO = false); } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 56880bd4822c758..fc961d5e99050c6 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -42,6 +42,7 @@ class ScalarEvolution; class SCEV; class SCEVExpander; class TargetLibraryInfo; +class TaskInfo; class LPPassManager; class Instruction; struct RuntimeCheckingPtrGroup; @@ -153,8 +154,8 @@ class SinkAndHoistLICMFlags { bool sinkRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *, TargetLibraryInfo *, TargetTransformInfo *, Loop *CurLoop, MemorySSAUpdater &, ICFLoopSafetyInfo *, - SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, - Loop *OutermostLoop = nullptr); + SinkAndHoistLICMFlags &, TaskInfo *, + OptimizationRemarkEmitter *, Loop *OutermostLoop = nullptr); /// Call sinkRegion on loops contained within the specified loop /// in order from innermost to outermost. @@ -162,7 +163,7 @@ bool sinkRegionForLoopNest(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *, TargetLibraryInfo *, TargetTransformInfo *, Loop *, MemorySSAUpdater &, ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &, - OptimizationRemarkEmitter *); + TaskInfo *, OptimizationRemarkEmitter *); /// Walk the specified region of the CFG (defined by all blocks /// dominated by the specified block, and that are in the current loop) in depth @@ -177,8 +178,8 @@ bool sinkRegionForLoopNest(DomTreeNode *, AAResults *, LoopInfo *, bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *, AssumptionCache *, TargetLibraryInfo *, Loop *, MemorySSAUpdater &, ScalarEvolution *, ICFLoopSafetyInfo *, - SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool, - bool AllowSpeculation); + SinkAndHoistLICMFlags &, TaskInfo *, + OptimizationRemarkEmitter *, bool, bool AllowSpeculation); /// Return true if the induction variable \p IV in a Loop whose latch is /// \p LatchBlock would become dead if the exit test \p Cond were removed. 
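The loop-utility hunks around this point thread TaskInfo through interfaces that previously only knew about DominatorTree, LoopInfo, and MemorySSA; the updated deleteDeadLoop signature appears just below. A hedged sketch of a call site using the new parameter:

```c++
// Pass TaskInfo (may be null) so erasing the loop keeps Tapir task structure
// consistent; matches the deleteDeadLoop signature updated in this patch.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TapirTaskInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

void eraseLoopWithTasks(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
                        LoopInfo *LI, TaskInfo *TI) {
  deleteDeadLoop(L, DT, SE, LI, TI, /*MSSA=*/nullptr);
}
```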
@@ -192,12 +193,13 @@ bool isAlmostDeadIV(PHINode *IV, BasicBlock *LatchBlock, Value *Cond); /// - The loop needs to have a Preheader /// - A unique dedicated exit block must exist /// -/// This also updates the relevant analysis information in \p DT, \p SE, \p LI -/// and \p MSSA if pointers to those are provided. +/// This also updates the relevant analysis information in \p DT, \p SE, \p LI, +/// \p TI and \p MSSA if pointers to those are provided. /// It also updates the loop PM if an updater struct is provided. void deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, - LoopInfo *LI, MemorySSA *MSSA = nullptr); + LoopInfo *LI, TaskInfo *TI = nullptr, + MemorySSA *MSSA = nullptr); /// Remove the backedge of the specified loop. Handles loop nests and general /// loop structures subject to the precondition that the loop has no parent @@ -220,8 +222,9 @@ bool promoteLoopAccessesToScalars( SmallVectorImpl &, SmallVectorImpl &, PredIteratorCache &, LoopInfo *, DominatorTree *, AssumptionCache *AC, const TargetLibraryInfo *, TargetTransformInfo *, Loop *, - MemorySSAUpdater &, ICFLoopSafetyInfo *, OptimizationRemarkEmitter *, - bool AllowSpeculation, bool HasReadsOutsideSet); + MemorySSAUpdater &, ICFLoopSafetyInfo *, TaskInfo *, + OptimizationRemarkEmitter *, bool AllowSpeculation, + bool HasReadsOutsideSet); /// Does a BFS from a given node to all of its children inside a given loop. /// The returned vector of nodes includes the starting point. @@ -307,6 +310,8 @@ TransformationMode hasUnrollAndJamTransformation(const Loop *L); TransformationMode hasVectorizeTransformation(const Loop *L); TransformationMode hasDistributeTransformation(const Loop *L); TransformationMode hasLICMVersioningTransformation(const Loop *L); +TransformationMode hasLoopStripmineTransformation(const Loop *L); +TransformationMode hasLoopSpawningTransformation(const Loop *L); /// @} /// Set input string into loop metadata by keeping other values intact. @@ -355,7 +360,7 @@ void getLoopAnalysisUsage(AnalysisUsage &AU); /// If \p ORE is set use it to emit optimization remarks. bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, Loop *CurLoop, MemorySSAUpdater &MSSAU, - bool TargetExecutesOncePerLoop, + bool TargetExecutesOncePerLoop, TaskInfo *TI, SinkAndHoistLICMFlags &LICMFlags, OptimizationRemarkEmitter *ORE = nullptr); diff --git a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h index f827ffd3e676aca..68c97c8a1a5292d 100644 --- a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h +++ b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h @@ -20,6 +20,7 @@ template class ArrayRef; class AllocaInst; class DominatorTree; class AssumptionCache; +class TaskInfo; /// Return true if this alloca is legal for promotion. /// @@ -37,7 +38,7 @@ bool isAllocaPromotable(const AllocaInst *AI); /// the same function. /// void PromoteMemToReg(ArrayRef Allocas, DominatorTree &DT, - AssumptionCache *AC = nullptr); + AssumptionCache *AC = nullptr, TaskInfo *TI = nullptr); } // End llvm namespace diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h index 29d96a0ab6bf5b7..4f38c4c08b4224b 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h @@ -55,6 +55,9 @@ class SSAUpdater { /// the vector. SmallVectorImpl *InsertedPHIs; + /// This keeps track of which values are defined in detached blocks. 
+ void *VID = nullptr; + public: /// If InsertedPHIs is specified, it will be filled /// in with all PHI Nodes created by rewriting. @@ -107,6 +110,8 @@ class SSAUpdater { /// merge the appropriate values, and this value isn't live out of the block. Value *GetValueInMiddleOfBlock(BasicBlock *BB); + bool GetValueIsDetachedInBlock(BasicBlock *BB); + /// Rewrite a use of the symbolic value. /// /// This handles PHI nodes, which use their value in the corresponding diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h index 28ff6c4c7927d8c..a80a52ac7c7480b 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h @@ -65,6 +65,9 @@ class SSAUpdaterImpl { // Marker for existing PHIs that match. PhiT *PHITag = nullptr; + // Flag to indicate that the AvailableVal would be used after a Reattach. + bool DetachedUse = false; + BBInfo(BlkT *ThisBB, ValT V) : BB(ThisBB), AvailableVal(V), DefBB(V ? this : nullptr) {} }; @@ -75,6 +78,10 @@ class SSAUpdaterImpl { SmallVectorImpl *InsertedPHIs; + using ValIsDetachedTy = DenseMap; + + ValIsDetachedTy *ValIsDetached; + using BlockListTy = SmallVectorImpl; using BBMapTy = DenseMap; @@ -83,8 +90,9 @@ class SSAUpdaterImpl { public: explicit SSAUpdaterImpl(UpdaterT *U, AvailableValsTy *A, - SmallVectorImpl *Ins) : - Updater(U), AvailableVals(A), InsertedPHIs(Ins) {} + SmallVectorImpl *Ins, + ValIsDetachedTy *D = nullptr) : + Updater(U), AvailableVals(A), InsertedPHIs(Ins), ValIsDetached(D) {} /// GetValue - Check to see if AvailableVals has an entry for the specified /// BB and if so, return it. If not, construct SSA form by first @@ -375,6 +383,10 @@ class SSAUpdaterImpl { (*AvailableVals)[Info->BB] = PHI; } + // Set of blocks with detached values that would be used except + // for Reattach. + SmallVector DetachedValBlocks; + // Now go back through the worklist in reverse order to fill in the // arguments for any new PHIs added in the forward traversal. for (typename BlockListTy::reverse_iterator I = BlockList->rbegin(), @@ -393,14 +405,47 @@ class SSAUpdaterImpl { if (!PHI) continue; + // Resolve detach and reattach predecessor information. A reattach + // predecessor should use the same available value as its corresponding + // detach. If a reattach predecessor does not have the same available + // value as its corresponding detach predecessor, note the use of a + // detached value. + SmallVector DetachPredInfo, ReattachPredInfo; // Iterate through the block's predecessors. for (unsigned p = 0; p != Info->NumPreds; ++p) { BBInfo *PredInfo = Info->Preds[p]; BlkT *Pred = PredInfo->BB; + if (Traits::BlockReattaches(Pred, Updater)) { + ReattachPredInfo.push_back(PredInfo); + continue; + } // Skip to the nearest preceding definition. if (PredInfo->DefBB != PredInfo) PredInfo = PredInfo->DefBB; Traits::AddPHIOperand(PHI, PredInfo->AvailableVal, Pred); + if (Traits::BlockDetaches(Pred, Updater)) + DetachPredInfo.push_back(PredInfo); + } + if (!ReattachPredInfo.empty()) { + assert(!DetachPredInfo.empty() && "Reattach predecessor found with no " + "corresponding Detach predecessor."); + for (BBInfo *RPInfo : ReattachPredInfo) { + bool FoundMatchingDetach = false; + for (BBInfo *DPInfo : DetachPredInfo) { + if (RPInfo->DefBB->BB == DPInfo->DefBB->BB) { + // Available value from predecessor through a reattach is the + // same as that for the corresponding detach. 
+ Traits::AddPHIOperand(PHI, DPInfo->AvailableVal, RPInfo->BB); + FoundMatchingDetach = true; + break; + } + } + if (!FoundMatchingDetach) { + DetachedValBlocks.push_back(Info); + Traits::AddPHIOperand(PHI, Traits::GetUndefVal(RPInfo->BB, Updater), + RPInfo->BB); + } + } } LLVM_DEBUG(dbgs() << " Inserted PHI: " << *PHI << "\n"); @@ -408,6 +453,9 @@ class SSAUpdaterImpl { // If the client wants to know about all new instructions, tell it. if (InsertedPHIs) InsertedPHIs->push_back(PHI); } + + // Mark any definitions that are detached from their use. + MarkDetachedDefs(&DetachedValBlocks); } /// FindExistingPHI - Look through the PHI nodes in a block to see if any of @@ -441,7 +489,21 @@ class SSAUpdaterImpl { for (typename Traits::PHI_iterator I = Traits::PHI_begin(PHI), E = Traits::PHI_end(PHI); I != E; ++I) { ValT IncomingVal = I.getIncomingValue(); - BBInfo *PredInfo = BBMap[I.getIncomingBlock()]; + BlkT *BB = I.getIncomingBlock(); + + // Replace a reattach predecessor with the corresponding + // detach predecessor. + // + // TODO: Remove the implicit assumption here that each basic + // block has at most one reattach predecessor. + if (Traits::BlockReattaches(BB, Updater)) + for (typename Traits::PHI_iterator PI = Traits::PHI_begin(PHI), + PE = Traits::PHI_end(PHI); PI != PE; ++PI) + if (Traits::BlockDetaches(PI.getIncomingBlock(), Updater)) { + BB = PI.getIncomingBlock(); + break; + } + BBInfo *PredInfo = BBMap[BB]; // Skip to the nearest preceding definition. if (PredInfo->DefBB != PredInfo) PredInfo = PredInfo->DefBB; @@ -484,6 +546,30 @@ class SSAUpdaterImpl { BBMap[BB]->AvailableVal = PHIVal; } } + + /// MarkDetachedDefs - Mark all definitions that reach the basic + /// blocks in WorkList as having detached uses. + void MarkDetachedDefs(SmallVector *WorkList) { + BBInfo *Info; + while (!WorkList->empty()) { + Info = WorkList->pop_back_val(); + Info->DetachedUse = true; + + ValT AvailableVal = Info->AvailableVal; + if (!AvailableVal) + continue; + + if (ValIsDetached) + (*ValIsDetached)[Info->BB] = true; + + if (Traits::ValueIsPHI(AvailableVal, Updater) || + Info->DefBB != Info) + for (unsigned p = 0; p != Info->NumPreds; ++p) + if (!Info->Preds[p]->DetachedUse) + WorkList->push_back(Info->Preds[p]); + } + } + }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/TapirUtils.h b/llvm/include/llvm/Transforms/Utils/TapirUtils.h new file mode 100644 index 000000000000000..ce61e078d567393 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/TapirUtils.h @@ -0,0 +1,382 @@ +//===- TapirUtils.h - Utility methods for Tapir ----------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file utility methods for handling code containing Tapir instructions. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_TAPIRUTILS_H +#define LLVM_TRANSFORMS_UTILS_TAPIRUTILS_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +namespace llvm { + +class BasicBlock; +class DominatorTree; +class DomTreeUpdater; +class Loop; +class LoopInfo; +class MemorySSAUpdater; +class Spindle; +class Task; +class TaskInfo; + +// Check if the given instruction is an intrinsic with the specified ID. If a +// value \p V is specified, then additionally checks that the first argument of +// the intrinsic matches \p V. +bool isTapirIntrinsic(Intrinsic::ID ID, const Instruction *I, + const Value *V = nullptr); + +/// Returns true if the given instruction performs a detached.rethrow, false +/// otherwise. If \p SyncRegion is specified, then additionally checks that the +/// detached.rethrow uses \p SyncRegion. +bool isDetachedRethrow(const Instruction *I, const Value *SyncRegion = nullptr); + +/// Returns true if the given instruction performs a taskframe.resume, false +/// otherwise. If \p TaskFrame is specified, then additionally checks that the +/// taskframe.resume uses \p TaskFrame. +bool isTaskFrameResume(const Instruction *I, const Value *TaskFrame = nullptr); + +/// Returns true if the given basic block \p B is a placeholder successor of a +/// taskframe.resume or detached.rethrow. +bool isTapirPlaceholderSuccessor(const BasicBlock *B); + +/// Returns a taskframe.resume that uses the given taskframe, or nullptr if no +/// taskframe.resume uses this taskframe. +InvokeInst *getTaskFrameResume(Value *TaskFrame); + +/// Returns the unwind destination of a taskframe.resume that uses the given +/// taskframe, or nullptr if no such unwind destination exists. +BasicBlock *getTaskFrameResumeDest(Value *TaskFrame); + +/// Returns true if the given instruction is a sync.uwnind, false otherwise. If +/// \p SyncRegion is specified, then additionally checks that the sync.unwind +/// uses \p SyncRegion. +bool isSyncUnwind(const Instruction *I, const Value *SyncRegion = nullptr, + bool CheckForInvoke = false); + +/// Returns true if BasicBlock \p B is a placeholder successor, that is, it's +/// the immediate successor of only detached-rethrow and taskframe-resume +/// instructions. +bool isPlaceholderSuccessor(const BasicBlock *B); + +/// Returns true if the given basic block ends a taskframe, false otherwise. In +/// particular, this method checks if the penultimate instruction in the basic +/// block is a taskframe.end intrinsic call. If \p TaskFrame is specified, then +/// additionally checks that the taskframe.end uses \p TaskFrame. +bool endsTaskFrame(const BasicBlock *B, const Value *TaskFrame = nullptr); + +/// Returns the spindle containing the taskframe.create used by task \p T, or +/// the entry spindle of \p T if \p T has no such taskframe.create spindle. +Spindle *getTaskFrameForTask(Task *T); + +// Removes the given sync.unwind instruction, if it is dead. Returns true if +// the sync.unwind was removed, false otherwise. +bool removeDeadSyncUnwind(CallBase *SyncUnwind, DomTreeUpdater *DTU = nullptr); + +/// Returns true if the reattach instruction appears to match the given detach +/// instruction, false otherwise. +bool ReattachMatchesDetach(const ReattachInst *RI, const DetachInst *DI, + DominatorTree *DT = nullptr); + +/// Returns true of the given task itself contains a sync instruction. 
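Several of the predicates above (isDetachedRethrow, isTaskFrameResume, isSyncUnwind) classify the intrinsic-based terminators Tapir places at task boundaries. A hedged usage sketch of one of them:

```c++
// Count blocks whose terminator is a detached.rethrow for a given sync
// region, using the isDetachedRethrow predicate declared above.
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/TapirUtils.h"

using namespace llvm;

unsigned countDetachedRethrows(const Function &F, const Value *SyncRegion) {
  unsigned Count = 0;
  for (const BasicBlock &BB : F)
    if (isDetachedRethrow(BB.getTerminator(), SyncRegion))
      ++Count;
  return Count;
}
```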
+bool taskContainsSync(const Task *T); + +/// Move static allocas in Block into Entry, which is assumed to dominate Block. +/// Leave lifetime markers behind in Block and before each instruction in +/// ExitPoints for those static allocas. Returns true if Block still contains +/// dynamic allocas, which cannot be moved. +bool MoveStaticAllocasInBlock(BasicBlock *Entry, BasicBlock *Block, + SmallVectorImpl &ExitPoints); + +/// Inline any taskframe.resume markers associated with the given taskframe. If +/// \p DT is provided, then it will be updated to reflect the CFG changes. +void InlineTaskFrameResumes(Value *TaskFrame, DominatorTree *DT = nullptr); + +/// Clone exception-handling blocks EHBlocksToClone, with predecessors +/// EHBlockPreds in a given task. Updates EHBlockPreds to point at the cloned +/// blocks. If the given pointers are non-null, updates blocks in *InlinedLPads +/// and *DetachedRethrows to refer to cloned blocks, and updates DT and LI to +/// reflect CFG updates. +void cloneEHBlocks(Function *F, SmallVectorImpl &EHBlocksToClone, + SmallPtrSetImpl &EHBlockPreds, + const char *Suffix, + SmallPtrSetImpl *InlinedLPads, + SmallVectorImpl *DetachedRethrows, + DominatorTree *DT = nullptr, LoopInfo *LI = nullptr); + +/// Serialize the detach DI. \p ParentEntry should be the entry block of the +/// task that contains DI. \p Reattaches, \p InlinedLPads, and \p +/// DetachedRethrows identify the reattaches, landing pads, and detached +/// rethrows in the task DI spawns that need special handling during +/// serialization. If \p DT is provided, then it will be updated to reflect the +/// CFG changes. +void SerializeDetach(DetachInst *DI, BasicBlock *ParentEntry, + BasicBlock *EHContinue, Value *LPadValInEHContinue, + SmallVectorImpl &Reattaches, + SmallVectorImpl *EHBlocksToClone, + SmallPtrSetImpl *EHBlockPreds, + SmallPtrSetImpl *InlinedLPads, + SmallVectorImpl *DetachedRethrows, + bool ReplaceWithTaskFrame = false, + DominatorTree *DT = nullptr, LoopInfo *LI = nullptr); + +/// Analyze a task T for serialization. Gets the reattaches, landing pads, and +/// detached rethrows that need special handling during serialization. +void AnalyzeTaskForSerialization( + Task *T, SmallVectorImpl &Reattaches, + SmallVectorImpl &EHBlocksToClone, + SmallPtrSetImpl &EHBlockPreds, + SmallPtrSetImpl &InlinedLPads, + SmallVectorImpl &DetachedRethrows); + +/// Serialize the detach DI that spawns task T. If \p DT is provided, then it +/// will be updated to reflect the CFG changes. +void SerializeDetach(DetachInst *DI, Task *T, bool ReplaceWithTaskFrame = false, + DominatorTree *DT = nullptr); + +/// Get the entry basic block to the detached context that contains +/// the specified block. +const BasicBlock *GetDetachedCtx(const BasicBlock *BB); +BasicBlock *GetDetachedCtx(BasicBlock *BB); + +// Returns true if the function may not be synced at the point of the given +// basic block, false otherwise. This function does a simple depth-first +// traversal of the CFG, and as such, produces a conservative result. +bool mayBeUnsynced(const BasicBlock *BB); + +/// isDetachContinueEdge - Return true if the edge from terminator instruction +/// TI to successor basic block Succ is a detach-continue edge. +bool isDetachContinueEdge(const Instruction *TI, const BasicBlock *Succ); + +/// isCriticalContinueEdge - Return true if the specified edge is a critical +/// detach-continue edge. 
Critical detach-continue edges are critical edges - +/// from a block with multiple successors to a block with multiple predecessors +/// - even after ignoring all reattach edges. +bool isCriticalContinueEdge(const Instruction *TI, unsigned SuccNum); + +/// GetDetachedCFG - Get the set of basic blocks in the CFG of the parallel task +/// spawned by detach instruction DI. The CFG will include the +/// exception-handling blocks that are separately identified in EHBlocks, which +/// might not be unique to the task. TaskReturns will store the set of basic +/// blocks that terminate the CFG of the parallel task. +void GetDetachedCFG(const DetachInst &DI, const DominatorTree &DT, + SmallPtrSetImpl &TaskBlocks, + SmallPtrSetImpl &EHBlocks, + SmallPtrSetImpl &TaskReturns); + +/// canDetach - Return true if the given function can perform a detach, false +/// otherwise. +bool canDetach(const Function *F); + +/// getDetachUnwindPHIUses - Collect all PHI nodes that directly or indirectly +/// use the landing pad for the unwind destination of detach DI. +void getDetachUnwindPHIUses(DetachInst *DI, + SmallPtrSetImpl &UnwindPHIs); + +/// getTaskFrameUsed - Return the taskframe used in the given detached block. +Value *getTaskFrameUsed(BasicBlock *Detached); + +/// splitTaskFrameCreateBlocks - Split basic blocks in function F at +/// taskframe.create intrinsics. Returns true if anything changed, false +/// otherwise. +bool splitTaskFrameCreateBlocks(Function &F, DominatorTree *DT = nullptr, + TaskInfo *TI = nullptr, LoopInfo *LI = nullptr, + MemorySSAUpdater *MSSAU = nullptr); + +/// taskFrameContains - Returns true if the given basic block \p B is contained +/// within the taskframe \p TF. +bool taskFrameContains(const Spindle *TF, const BasicBlock *B, + const TaskInfo &TI); + +/// taskFrameEncloses - Returns true if the given basic block \p B is enclosed +/// within the taskframe \p TF. +bool taskFrameEncloses(const Spindle *TF, const BasicBlock *B, + const TaskInfo &TI); + +/// fixupTaskFrameExternalUses - Fix any uses of variables defined in +/// taskframes, but outside of tasks themselves. For each such variable, insert +/// a memory allocation in the parent frame, add a store to that memory in the +/// taskframe, and modify external uses to use the value in that memory loaded +/// at the tasks continuation. +void fixupTaskFrameExternalUses(Spindle *TF, const TaskInfo &TI, + const DominatorTree &DT); + +/// FindTaskFrameCreateInBlock - Return the taskframe.create intrinsic in \p BB, +/// or nullptr if no taskframe.create intrinsic exists in \p BB. If specified, +/// ignores TFToIgnore when scanning for a taskframe.create. +Instruction *FindTaskFrameCreateInBlock(BasicBlock *BB, + const Value *TFToIgnore = nullptr); + +/// CreateSubTaskUnwindEdge - Create a landingpad for the exit of a taskframe or +/// task. +BasicBlock *CreateSubTaskUnwindEdge(Intrinsic::ID TermFunc, Value *Token, + BasicBlock *UnwindEdge, + BasicBlock *Unreachable, + Instruction *ParentI); + +/// promoteCallsInTasksToInvokes - Traverse the control-flow graph of F to +/// convert calls to invokes, recursively traversing tasks and taskframes to +/// insert appropriate detached.rethrow and taskframe.resume terminators. +void promoteCallsInTasksToInvokes(Function &F, const Twine Name = "cleanup"); + +/// eraseTaskFrame - Remove the specified taskframe and all uses of it. The +/// given \p TaskFrame should correspond to a taskframe.create call. 
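promoteCallsInTasksToInvokes above is what routes exceptions out of spawned work: potentially-throwing calls inside a task become invokes whose unwind paths end in detached.rethrow or taskframe.resume terminators. An illustrative OpenCilk source fragment whose lowering needs exactly that plumbing:

```c++
// Illustrative only: an exception thrown inside a cilk_for body must leave
// the spawned task through a dedicated unwind edge (detached.rethrow) so the
// parent observes it at the sync.
#include <cilk/cilk.h>
#include <stdexcept>

static void may_throw(int i) {
  if (i == 7)
    throw std::runtime_error("iteration 7 failed");
}

void run_all(int n) {
  cilk_for (int i = 0; i < n; ++i)
    may_throw(i); // throwing call inside a task: promoted to an invoke
}
```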
The +/// DominatorTree \p DT is updated to reflect changes to the CFG, if \p DT is +/// not null. +void eraseTaskFrame(Value *TaskFrame, DominatorTree *DT = nullptr); + +/// Utility class for getting and setting Tapir-related loop hints in the form +/// of loop metadata. +/// +/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +class TapirLoopHints { +public: + enum SpawningStrategy { + ST_SEQ, + ST_DAC, + ST_END, + }; + +private: + enum HintKind { HK_STRATEGY, HK_GRAINSIZE }; + + /// Hint - associates name and validation with the hint value. + struct Hint { + const char *Name; + unsigned Value; // This may have to change for non-numeric values. + HintKind Kind; + + Hint(const char *Name, unsigned Value, HintKind Kind) + : Name(Name), Value(Value), Kind(Kind) {} + + bool validate(unsigned Val) const { + switch (Kind) { + case HK_STRATEGY: + return (Val < ST_END); + case HK_GRAINSIZE: + return true; + } + return false; + } + }; + + /// Spawning strategy + Hint Strategy; + /// Grainsize + Hint Grainsize; + + /// Return the loop metadata prefix. + static StringRef Prefix() { return "tapir.loop."; } + +public: + static std::string printStrategy(enum SpawningStrategy Strat) { + switch(Strat) { + case TapirLoopHints::ST_SEQ: + return "Spawn iterations sequentially"; + case TapirLoopHints::ST_DAC: + return "Use divide-and-conquer"; + case TapirLoopHints::ST_END: + return "Unknown"; + default: + llvm_unreachable("Unknown print strategy"); + } + } + + TapirLoopHints(const Loop *L) + : Strategy("spawn.strategy", ST_SEQ, HK_STRATEGY), + Grainsize("grainsize", 0, HK_GRAINSIZE), + TheLoop(L) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); + } + + // /// Dumps all the hint information. + // std::string emitRemark() const { + // TapirLoopReport R; + // R << "Strategy = " << printStrategy(getStrategy()); + + // return R.str(); + // } + + enum SpawningStrategy getStrategy() const { + return (SpawningStrategy)Strategy.Value; + } + + unsigned getGrainsize() const { + return Grainsize.Value; + } + + /// Clear Tapir Hints metadata. + void clearHintsMetadata(); + + /// Mark the loop L as having no spawning strategy. + void clearStrategy() { + Strategy.Value = ST_SEQ; + Hint Hints[] = {Strategy}; + writeHintsToMetadata(Hints); + } + + void clearClonedLoopMetadata(ValueToValueMapTy &VMap) { + Hint ClearStrategy = Strategy; + ClearStrategy.Value = ST_SEQ; + Hint Hints[] = {ClearStrategy}; + writeHintsToClonedMetadata(Hints, VMap); + } + + void setAlreadyStripMined() { + Grainsize.Value = 1; + Hint Hints[] = {Grainsize}; + writeHintsToMetadata(Hints); + } + +private: + /// Find hints specified in the loop metadata and update local values. + void getHintsFromMetadata(); + + /// Checks string hint with one operand and set value if valid. + void setHint(StringRef Name, Metadata *Arg); + + /// Create a new hint from name / value pair. + MDNode *createHintMetadata(StringRef Name, unsigned V) const; + + /// Matches metadata with hint name. + bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes) const; + + /// Sets current hints into loop metadata, keeping other values intact. + void writeHintsToMetadata(ArrayRef HintTypes); + + /// Sets hints into cloned loop metadata, keeping other values intact. 
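TapirLoopHints stores the strategy and grainsize hints as loop metadata under the "tapir.loop." prefix, so a grainsize of 8, for example, becomes an operand of the loop's llvm.loop node. A hedged sketch of building that operand by hand; the helper name is hypothetical, and writeHintsToMetadata is the real mechanism:

```c++
// Build the metadata node !{"tapir.loop.grainsize", i32 8} that the hint
// machinery above would attach to a loop's llvm.loop metadata.
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"

using namespace llvm;

MDNode *makeGrainsizeHint(LLVMContext &Ctx, unsigned Grainsize) {
  Metadata *Ops[] = {
      MDString::get(Ctx, "tapir.loop.grainsize"),
      ConstantAsMetadata::get(
          ConstantInt::get(Type::getInt32Ty(Ctx), Grainsize))};
  return MDNode::get(Ctx, Ops);
}
```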
+ void writeHintsToClonedMetadata(ArrayRef HintTypes, + ValueToValueMapTy &VMap); + + /// The loop these hints belong to. + const Loop *TheLoop; +}; + +/// Returns true if Tapir-loop hints require loop outlining during lowering. +bool hintsDemandOutlining(const TapirLoopHints &Hints); + +/// Create a new Loop MDNode by copying non-Tapir metadata from OrigLoopID. +MDNode *CopyNonTapirLoopMetadata(MDNode *LoopID, MDNode *OrigLoopID); + +/// Examine a given loop to determine if it is a Tapir loop that can and should +/// be processed. Returns the Task that encodes the loop body if so, or nullptr +/// if not. +Task *getTaskIfTapirLoop(const Loop *L, TaskInfo *TI); + +} // End llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Utils/TaskCanonicalize.h b/llvm/include/llvm/Transforms/Utils/TaskCanonicalize.h new file mode 100644 index 000000000000000..a2c219b471a0c13 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/TaskCanonicalize.h @@ -0,0 +1,28 @@ +//===- TaskCanonicalize.h - Tapir task canonicalization pass -*- C++ -*----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass canonicalizes Tapir tasks. In particular, this pass splits blocks +// at taskframe.create intrinsics. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_UTILS_TASKCANONICALIZE_H +#define LLVM_TRANSFORMS_UTILS_TASKCANONICALIZE_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// This pass is responsible for Tapir task simplification. +class TaskCanonicalizePass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_TASKCANONICALIZE_H diff --git a/llvm/include/llvm/Transforms/Utils/TaskSimplify.h b/llvm/include/llvm/Transforms/Utils/TaskSimplify.h new file mode 100644 index 000000000000000..681af4e07ea4d8f --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/TaskSimplify.h @@ -0,0 +1,41 @@ +//===- TaskSimplify.h - Tapir task simplification pass -*- C++ -*----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass performs several transformations to simplify Tapir tasks. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_UTILS_TASKSIMPLIFY_H +#define LLVM_TRANSFORMS_UTILS_TASKSIMPLIFY_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class DominatorTree; +struct MaybeParallelTasks; +class Task; +class TaskInfo; + +/// This pass is responsible for Tapir task simplification. +class TaskSimplifyPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// Simplify syncs in the specified task T. +bool simplifySyncs(Task *T, MaybeParallelTasks &MPTasks); + +/// Simplify the specified task T. +bool simplifyTask(Task *T); + +/// Simplify the taskframes analyzed by TapirTaskInfo TI. 
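TaskCanonicalizePass and TaskSimplifyPass above are ordinary new-pass-manager function passes. A hedged sketch of composing them in a pipeline; real pipelines are built by the PassBuilder, and the pass order shown here just reflects that canonicalization (splitting blocks at taskframe.create) is meant to run before simplification:

```c++
// Add the Tapir task passes to a new-pass-manager function pipeline.
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Utils/TaskCanonicalize.h"
#include "llvm/Transforms/Utils/TaskSimplify.h"

using namespace llvm;

FunctionPassManager buildTapirTaskCleanup() {
  FunctionPassManager FPM;
  FPM.addPass(TaskCanonicalizePass());
  FPM.addPass(TaskSimplifyPass());
  return FPM;
}
```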
+bool simplifyTaskFrames(TaskInfo &TI, DominatorTree &DT); + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_TASKSIMPLIFY_H diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h index 797c082333a76c8..76b3a6792b78008 100644 --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -35,6 +35,7 @@ class ProfileSummaryInfo; class OptimizationRemarkEmitter; class ScalarEvolution; class StringRef; +class TaskInfo; class Value; using NewLoopsMap = SmallDenseMap; @@ -79,7 +80,7 @@ struct UnrollLoopOptions { LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, - AssumptionCache *AC, + AssumptionCache *AC, TaskInfo *TI, const llvm::TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop = nullptr, diff --git a/llvm/include/module.modulemap b/llvm/include/module.modulemap index b00da6d7cd28c78..5638d4ea8b3ad80 100644 --- a/llvm/include/module.modulemap +++ b/llvm/include/module.modulemap @@ -12,6 +12,7 @@ module LLVM_Analysis { textual header "llvm/Analysis/ScalarFuncs.def" textual header "llvm/Analysis/TargetLibraryInfo.def" textual header "llvm/Analysis/VecFuncs.def" + textual header "llvm/Analysis/TapirTargetFuncs.def" } module LLVM_AsmParser { diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 9cdb315b6088f37..22e9da0334fc180 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -23,10 +23,13 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/DataRaceFreeAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ObjCARCAliasAnalysis.h" @@ -38,6 +41,7 @@ #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" @@ -110,6 +114,14 @@ AliasResult AAResults::alias(const MemoryLocation &LocA, return alias(LocA, LocB, AAQIP, nullptr); } +AliasResult AAResults::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB, + bool AssumeSameSpindle) { + SimpleAAQueryInfo AAQIP(*this); + AAQIP.AssumeSameSpindle = AssumeSameSpindle; + return alias(LocA, LocB, AAQIP, nullptr); +} + AliasResult AAResults::alias(const MemoryLocation &LocA, const MemoryLocation &LocB, AAQueryInfo &AAQI, const Instruction *CtxI) { @@ -189,6 +201,37 @@ ModRefInfo AAResults::getModRefInfo(const Instruction *I, return getModRefInfo(I, Call2, AAQIP); } +ModRefInfo AAResults::getModRefInfo(const Instruction *I, + const CallBase *Call2, + bool AssumeSameSpindle) { + SimpleAAQueryInfo AAQIP(*this); + AAQIP.AssumeSameSpindle = AssumeSameSpindle; + return getModRefInfo(I, Call2, AAQIP); +} + +/// Returns true if the given instruction performs a detached rethrow, false +/// otherwise. 
+static bool isDetachedRethrow(const Instruction *I, + const Value *SyncRegion = nullptr) { + if (const InvokeInst *II = dyn_cast(I)) + if (const Function *Called = II->getCalledFunction()) + if (Intrinsic::detached_rethrow == Called->getIntrinsicID()) + if (!SyncRegion || (SyncRegion == II->getArgOperand(0))) + return true; + return false; +} + +static bool taskTerminator(const Instruction *T, const Value *SyncRegion) { + if (const ReattachInst *RI = dyn_cast(T)) + if (SyncRegion == RI->getSyncRegion()) + return true; + + if (isDetachedRethrow(T, SyncRegion)) + return true; + + return false; +} + ModRefInfo AAResults::getModRefInfo(const Instruction *I, const CallBase *Call2, AAQueryInfo &AAQI) { // We may have two calls. @@ -199,6 +242,47 @@ ModRefInfo AAResults::getModRefInfo(const Instruction *I, const CallBase *Call2, // If this is a fence, just return ModRef. if (I->isFenceLike()) return ModRefInfo::ModRef; + // If this is a detach, collect the ModRef info of the detached operations. + if (auto D = dyn_cast(I)) { + ModRefInfo Result = ModRefInfo::NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(D->getDetached()); + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + for (Instruction &DI : BB->instructionsWithoutDebug()) { + // Fail fast if we encounter an invalid CFG. + assert(!(D == &DI) && + "Detached CFG reaches its own Detach instruction."); + + if (&DI == Call2) + return ModRefInfo::NoModRef; + + // No need to recursively check nested syncs or detaches, as nested + // tasks are wholly contained in the detached sub-CFG we're iterating + // through. + if (isa(DI) || isa(DI)) + continue; + + if (isa(DI) || isa(DI) || isa(DI) || + isa(DI) || isa(DI) || + isa(DI) || isa(DI) || + DI.isFenceLike() || isa(DI)) + Result |= getModRefInfo(&DI, Call2, AAQI); + } + + // Add successors + const Instruction *T = BB->getTerminator(); + if (taskTerminator(T, D->getSyncRegion())) + continue; + for (unsigned idx = 0, max = T->getNumSuccessors(); idx < max; ++idx) + WorkList.push_back(T->getSuccessor(idx)); + } + return Result; + } // Otherwise, check if the call modifies or references the // location this memory access defines. The best we can say // is that if the call references what this instruction @@ -405,6 +489,79 @@ MemoryEffects AAResults::getMemoryEffects(const Function *F) { return Result; } +MemoryEffects AAResults::getMemoryEffects(const DetachInst *D, + AAQueryInfo &AAQI) { + MemoryEffects Result = MemoryEffects::none(); + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(D->getDetached()); + while (!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + for (const Instruction &I : *BB) { + // Fail fast if we encounter an invalid CFG. + assert(!(D == &I) && + "Invalid CFG found: Detached CFG reaches its own Detach."); + + if (const auto *CS = dyn_cast(&I)) + Result |= getMemoryEffects(CS, AAQI); + + // Early-exit the moment we reach the top of the lattice. 
+ if (Result == MemoryEffects::unknown()) + return Result; + } + + // Add successors + const Instruction *T = BB->getTerminator(); + if (taskTerminator(T, D->getSyncRegion())) + continue; + for (unsigned idx = 0, max = T->getNumSuccessors(); idx < max; ++idx) + WorkList.push_back(T->getSuccessor(idx)); + } + + return Result; +} + +MemoryEffects AAResults::getMemoryEffects(const SyncInst *S, + AAQueryInfo &AAQI) { + MemoryEffects Result = MemoryEffects::none(); + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(S->getParent()); + while (!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + if (const DetachInst *D = dyn_cast(BB->getTerminator())) + Result |= getMemoryEffects(D, AAQI); + + // Early-exit the moment we reach the top of the lattice. + if (Result == MemoryEffects::unknown()) + return Result; + + // Add predecessors + for (const BasicBlock *Pred : predecessors(BB)) { + const Instruction *PT = Pred->getTerminator(); + // Ignore reattached predecessors and predecessors that end in syncs, + // because this sync does not wait on those predecessors. + if (isa(PT) || isa(PT) || isDetachedRethrow(PT)) + continue; + + // If this block is detached, ignore the predecessor that detaches it. + if (const DetachInst *Det = dyn_cast(PT)) + if (Det->getDetached() == BB) + continue; + + WorkList.push_back(Pred); + } + } + + return Result; +} + raw_ostream &llvm::operator<<(raw_ostream &OS, AliasResult AR) { switch (AR) { case AliasResult::NoAlias: @@ -607,6 +764,8 @@ ModRefInfo AAResults::getModRefInfo(const Instruction *I, if (OptLoc == std::nullopt) { if (const auto *Call = dyn_cast(I)) return getMemoryEffects(Call, AAQIP).getModRef(); + if (const auto *D = dyn_cast(I)) + return getMemoryEffects(D, AAQIP).getModRef(); } const MemoryLocation &Loc = OptLoc.value_or(MemoryLocation()); @@ -632,6 +791,10 @@ ModRefInfo AAResults::getModRefInfo(const Instruction *I, return getModRefInfo((const CatchPadInst *)I, Loc, AAQIP); case Instruction::CatchRet: return getModRefInfo((const CatchReturnInst *)I, Loc, AAQIP); + case Instruction::Detach: + return getModRefInfo((const DetachInst *)I, Loc, AAQIP); + case Instruction::Sync: + return getModRefInfo((const SyncInst *)I, Loc, AAQIP); default: assert(!I->mayReadOrWriteMemory() && "Unhandled memory access instruction!"); @@ -639,6 +802,89 @@ ModRefInfo AAResults::getModRefInfo(const Instruction *I, } } +ModRefInfo AAResults::getModRefInfo(const DetachInst *D, + const MemoryLocation &Loc, + AAQueryInfo &AAQI) { + ModRefInfo Result = ModRefInfo::NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(D->getDetached()); + while (!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + for (const Instruction &I : BB->instructionsWithoutDebug()) { + // Fail fast if we encounter an invalid CFG. + assert(!(D == &I) && + "Invalid CFG found: Detached CFG reaches its own Detach."); + + // No need to recursively check nested syncs or detaches, as nested tasks + // are wholly contained in the detached sub-CFG we're iterating through. + if (isa(I) || isa(I)) + continue; + + Result |= getModRefInfo(&I, Loc, AAQI); + + // Early-exit the moment we reach the top of the lattice. 
+ if (isModAndRefSet(Result)) + return Result; + } + + // Add successors + const Instruction *T = BB->getTerminator(); + if (taskTerminator(T, D->getSyncRegion())) + continue; + for (const BasicBlock *Successor : successors(BB)) + WorkList.push_back(Successor); + } + + return Result; +} + +ModRefInfo AAResults::getModRefInfo(const SyncInst *S, + const MemoryLocation &Loc, + AAQueryInfo &AAQI) { + // If no memory location pointer is given, treat the sync like a fence. + if (!Loc.Ptr) + return ModRefInfo::ModRef; + + ModRefInfo Result = ModRefInfo::NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(S->getParent()); + while(!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + if (const DetachInst *D = dyn_cast(BB->getTerminator())) { + Result |= getModRefInfo(D, Loc, AAQI); + + // Early-exit the moment we reach the top of the lattice. + if (isModAndRefSet(Result)) + return Result; + } + + // Add predecessors + for (const BasicBlock *Pred : predecessors(BB)) { + const Instruction *PT = Pred->getTerminator(); + // Ignore reattached predecessors and predecessors that end in syncs, + // because this sync does not wait on those predecessors. + if (isa(PT) || isa(PT) || isDetachedRethrow(PT)) + continue; + // If this block is detached, ignore the predecessor that detaches it. + if (const DetachInst *Det = dyn_cast(PT)) + if (Det->getDetached() == BB) + continue; + + WorkList.push_back(Pred); + } + } + + return Result; +} + /// Return information about whether a particular call site modifies /// or reads the specified memory location \p MemLoc before instruction \p I /// in a BasicBlock. @@ -762,6 +1008,7 @@ char AAResultsWrapperPass::ID = 0; INITIALIZE_PASS_BEGIN(AAResultsWrapperPass, "aa", "Function Alias Analysis Results", false, true) INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DRFAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(ExternalAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) @@ -804,6 +1051,8 @@ bool AAResultsWrapperPass::runOnFunction(Function &F) { AAR->addAAResult(WrapperPass->getResult()); if (auto *WrapperPass = getAnalysisIfAvailable()) AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = getAnalysisIfAvailable()) + AAR->addAAResult(WrapperPass->getResult()); // If available, run an external AA providing callback over the results as // well. 
@@ -828,6 +1077,7 @@ void AAResultsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addUsedIfAvailable(); AU.addUsedIfAvailable(); AU.addUsedIfAvailable(); + AU.addUsedIfAvailable(); AU.addUsedIfAvailable(); } @@ -844,6 +1094,12 @@ bool llvm::isNoAliasCall(const Value *V) { return false; } +bool llvm::isNoAliasCallIfInSameSpindle(const Value *V) { + if (isa(V)) + return false; + return isNoAliasCall(V); +} + static bool isNoAliasOrByValArgument(const Value *V) { if (const Argument *A = dyn_cast(V)) return A->hasNoAliasAttr() || A->hasByValAttr(); @@ -862,6 +1118,14 @@ bool llvm::isIdentifiedObject(const Value *V) { return false; } +bool llvm::isIdentifiedObjectIfInSameSpindle(const Value *V) { + if (isIdentifiedObject(V)) + return true; + if (isNoAliasCallIfInSameSpindle(V)) + return true; + return false; +} + bool llvm::isIdentifiedFunctionLocal(const Value *V) { return isa(V) || isNoAliasCall(V) || isNoAliasOrByValArgument(V); } diff --git a/llvm/lib/Analysis/AliasSetTracker.cpp b/llvm/lib/Analysis/AliasSetTracker.cpp index 32e545daaf22692..d89fa97946dd289 100644 --- a/llvm/lib/Analysis/AliasSetTracker.cpp +++ b/llvm/lib/Analysis/AliasSetTracker.cpp @@ -347,6 +347,14 @@ void AliasSetTracker::addUnknown(Instruction *Inst) { if (isa(Inst)) return; // Ignore DbgInfo Intrinsics. + // Check for invokes of detached.rethrow, taskframe.resume, or sync.unwind. + if (const InvokeInst *I = dyn_cast(Inst)) + if (const Function *Called = I->getCalledFunction()) + if (Intrinsic::detached_rethrow == Called->getIntrinsicID() || + Intrinsic::taskframe_resume == Called->getIntrinsicID() || + Intrinsic::sync_unwind == Called->getIntrinsicID()) + return; + if (auto *II = dyn_cast(Inst)) { // These intrinsics will show up as affecting memory, but they are just // markers. 
@@ -360,6 +368,12 @@ void AliasSetTracker::addUnknown(Instruction *Inst) {
     case Intrinsic::experimental_noalias_scope_decl:
     case Intrinsic::sideeffect:
     case Intrinsic::pseudoprobe:
+    case Intrinsic::syncregion_start:
+    case Intrinsic::taskframe_create:
+    case Intrinsic::taskframe_use:
+    case Intrinsic::taskframe_end:
+    case Intrinsic::taskframe_load_guard:
+    case Intrinsic::sync_unwind:
       return;
     }
   }
diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp
index 11cc6cfccea6af8..a07e16bd1c3a69d 100644
--- a/llvm/lib/Analysis/Analysis.cpp
+++ b/llvm/lib/Analysis/Analysis.cpp
@@ -37,6 +37,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializePostDomOnlyPrinterWrapperPassPass(Registry);
   initializeAAResultsWrapperPassPass(Registry);
   initializeGlobalsAAWrapperPassPass(Registry);
+  initializeDRFAAWrapperPassPass(Registry);
   initializeIVUsersWrapperPassPass(Registry);
   initializeIRSimilarityIdentifierWrapperPassPass(Registry);
   initializeLazyBranchProbabilityInfoPassPass(Registry);
@@ -57,7 +58,9 @@
   initializeScalarEvolutionWrapperPassPass(Registry);
   initializeStackSafetyGlobalInfoWrapperPassPass(Registry);
   initializeStackSafetyInfoWrapperPassPass(Registry);
+  initializeTapirRaceDetectWrapperPassPass(Registry);
   initializeTargetTransformInfoWrapperPassPass(Registry);
+  initializeTaskInfoWrapperPassPass(Registry);
   initializeTypeBasedAAWrapperPassPass(Registry);
   initializeScopedNoAliasAAWrapperPassPass(Registry);
   initializeLCSSAVerificationPassPass(Registry);
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index e474899fb548ec5..a050d07254aef40 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1530,6 +1530,258 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize,
   return Alias;
 }

+// Given that O1 != O2, return NoAlias if they cannot alias.
+static AliasResult UnderlyingNoAlias(const Value *O1, const Value *O2,
+                                     AAQueryInfo &AAQI) {
+  assert(O1 != O2 && "identical arguments to UnderlyingNoAlias");
+
+  // If V1/V2 point to two different objects, we know that we have no alias.
+  if (AAQI.AssumeSameSpindle) {
+    if (isIdentifiedObjectIfInSameSpindle(O1) &&
+        isIdentifiedObjectIfInSameSpindle(O2))
+      return AliasResult::NoAlias;
+  } else {
+    if (isIdentifiedObject(O1) && isIdentifiedObject(O2))
+      return AliasResult::NoAlias;
+  }
+
+  // Constant pointers can't alias with non-const isIdentifiedObject objects.
+  if ((isa<Constant>(O1) && isIdentifiedObject(O2) && !isa<Constant>(O2)) ||
+      (isa<Constant>(O2) && isIdentifiedObject(O1) && !isa<Constant>(O1)))
+    return AliasResult::NoAlias;
+
+  // Function arguments can't alias with things that are known to be
+  // unambiguously identified at the function level.
+  if ((isa<Argument>(O1) && isIdentifiedFunctionLocal(O2)) ||
+      (isa<Argument>(O2) && isIdentifiedFunctionLocal(O1)))
+    return AliasResult::NoAlias;
+
+  // If one pointer is the result of a call/invoke or load and the other is a
+  // non-escaping local object within the same function, then we know the
+  // object couldn't escape to a point where the call could return it.
+  //
+  // Note that if the pointers are in different functions, there are a
+  // variety of complications. A call with a nocapture argument may still
+  // temporarily store the nocapture argument's value in a temporary memory
+  // location if that memory location doesn't escape. Or it may pass a
+  // nocapture value to other functions as long as they don't capture it.
+  if (isEscapeSource(O1) &&
+      AAQI.CI->isNotCapturedBeforeOrAt(O2, cast<Instruction>(O1)))
+    return AliasResult::NoAlias;
+  if (isEscapeSource(O2) &&
+      AAQI.CI->isNotCapturedBeforeOrAt(O1, cast<Instruction>(O2)))
+    return AliasResult::NoAlias;
+
+  return AliasResult::MayAlias;
+}
+
+namespace {
+// TODO: Consider moving this code to AliasAnalysis.h, to make it accessible to
+// other alias analyses.
+// TODO: TapirFnBehavior::View and TapirFnBehavior::Strand may be redundant.
+enum class TapirFnBehavior : uint8_t {
+  None = 0,
+  Injective = 1,
+  Pure = 2, // including strand pure function in same strand
+  View = 4,
+  InjectiveOrPureOrView = Injective | Pure | View,
+  Strand = 8, // excluding strand pure function in same strand
+  Any = InjectiveOrPureOrView | Strand,
+};
+
+static inline bool noTapirFnBehavior(const TapirFnBehavior TFB) {
+  return (static_cast<uint8_t>(TFB) &
+          static_cast<uint8_t>(TapirFnBehavior::Any)) ==
+         static_cast<uint8_t>(TapirFnBehavior::None);
+}
+static inline bool isInjectiveSet(const TapirFnBehavior TFB) {
+  return (static_cast<uint8_t>(TFB) &
+          static_cast<uint8_t>(TapirFnBehavior::Injective)) ==
+         static_cast<uint8_t>(TapirFnBehavior::Injective);
+}
+static inline bool isPureSet(const TapirFnBehavior TFB) {
+  return (static_cast<uint8_t>(TFB) &
+          static_cast<uint8_t>(TapirFnBehavior::Pure)) ==
+         static_cast<uint8_t>(TapirFnBehavior::Pure);
+}
+static inline bool isViewSet(const TapirFnBehavior TFB) {
+  return (static_cast<uint8_t>(TFB) &
+          static_cast<uint8_t>(TapirFnBehavior::View)) ==
+         static_cast<uint8_t>(TapirFnBehavior::View);
+}
+static inline bool isInjectiveOrPureOrViewSet(const TapirFnBehavior TFB) {
+  return static_cast<uint8_t>(TFB) &
+         static_cast<uint8_t>(TapirFnBehavior::InjectiveOrPureOrView);
+}
+static inline bool isStrandSet(const TapirFnBehavior TFB) {
+  return (static_cast<uint8_t>(TFB) &
+          static_cast<uint8_t>(TapirFnBehavior::Strand)) ==
+         static_cast<uint8_t>(TapirFnBehavior::Strand);
+}
+static inline TapirFnBehavior setPure(const TapirFnBehavior TFB) {
+  return TapirFnBehavior(static_cast<uint8_t>(TFB) |
+                         static_cast<uint8_t>(TapirFnBehavior::Pure));
+}
+static inline TapirFnBehavior clearPure(const TapirFnBehavior TFB) {
+  return TapirFnBehavior(static_cast<uint8_t>(TFB) &
+                         ~static_cast<uint8_t>(TapirFnBehavior::Pure));
+}
+static inline TapirFnBehavior clearStrand(const TapirFnBehavior TFB) {
+  return TapirFnBehavior(static_cast<uint8_t>(TFB) &
+                         ~static_cast<uint8_t>(TapirFnBehavior::Strand));
+}
+static inline TapirFnBehavior unionTapirFnBehavior(const TapirFnBehavior TFB1,
+                                                   const TapirFnBehavior TFB2) {
+  return TapirFnBehavior(static_cast<uint8_t>(TFB1) |
+                         static_cast<uint8_t>(TFB2));
+}
+static inline TapirFnBehavior
+intersectTapirFnBehavior(const TapirFnBehavior TFB1,
+                         const TapirFnBehavior TFB2) {
+  return TapirFnBehavior(static_cast<uint8_t>(TFB1) &
+                         static_cast<uint8_t>(TFB2));
+}
+} // namespace
+
+// Tapir/OpenCilk code has some simple optimization opportunities.
+// 1. Some runtime functions are injections, i.e., they return nonaliasing
+// pointers when given nonaliasing arguments.
+// 2. Some runtime functions are pure, or pure within a region of execution,
+// which means the return values MustAlias if the arguments are identical.
+// 3. View lookups return a value that does not alias anything that the
+// argument does not alias (for simplicity, this implies injective).
+// 4. Token lookups return a value that does not alias any alloca or global.
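As a reader aid (editorial, not part of this patch), the following minimal sketch illustrates the "injective" property that the four categories above and `checkInjectiveArguments` rely on. The accessor `view_lookup` is hypothetical and merely stands in for a runtime lookup routine; the point is only that distinct keys yield distinct, non-overlapping storage, so accesses through lookups of provably distinct keys cannot alias.

```cpp
// Hypothetical injective accessor: distinct keys map to distinct slots.
// Compiles with any C++ compiler; no OpenCilk runtime is needed for this sketch.
#include <cstdio>

static int slots[16];

static int *view_lookup(unsigned key) { return &slots[key]; } // injective

int main() {
  int *p = view_lookup(1);
  int *q = view_lookup(2);
  // Because view_lookup is injective and 1 != 2, *p and *q are different
  // locations, so a compiler that knows the property may treat the two
  // stores below as NoAlias and reorder or vectorize around them.
  *p = 10;
  *q = 20;
  std::printf("%d %d\n", *p, *q); // prints "10 20"
  return 0;
}
```

A "pure" lookup would additionally return the same pointer when called twice with the same key, which is why the code that follows can answer MustAlias in that case.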
+static const Value *getRecognizedArgument(const Value *V, bool InSameSpindle, + const Value *&Fn, + TapirFnBehavior &Behavior) { + const CallInst *C = dyn_cast(V); + if (!C) + return nullptr; + unsigned NumOperands = C->getNumOperands(); + if (NumOperands != 2 && NumOperands != 5) + return nullptr; + + // Make TapirFnBehavior::Strand and TapirFnBehavior::Pure mutually exclusive. + if (isStrandSet(Behavior)) { + if (InSameSpindle) + Behavior = setPure(clearStrand(Behavior)); + else + Behavior = clearPure(Behavior); + } else if (C->doesNotAccessMemory() && C->doesNotThrow() && + C->hasFnAttr(Attribute::WillReturn)) { + Behavior = setPure(Behavior); + } + + if (noTapirFnBehavior(Behavior)) + return nullptr; + Fn = C->getCalledOperand(); + return C->getOperand(0); +} + +AliasResult +BasicAAResult::checkInjectiveArguments(const Value *V1, const Value *O1, + const Value *V2, const Value *O2, + AAQueryInfo &AAQI) { + // V1 and V2 are the original pointers stripped of casts + // O1 and O2 are the underlying objects stripped of GEP as well + + const Value *Fn1 = nullptr, *Fn2 = nullptr; + TapirFnBehavior Behavior1 = TapirFnBehavior::None, + Behavior2 = TapirFnBehavior::None; + bool InSameSpindle = AAQI.AssumeSameSpindle; + const Value *A1 = getRecognizedArgument(V1, InSameSpindle, Fn1, Behavior1); + const Value *A2 = getRecognizedArgument(V2, InSameSpindle, Fn2, Behavior2); + + if (!isInjectiveOrPureOrViewSet(Behavior1) && + !isInjectiveOrPureOrViewSet(Behavior2)) + return AliasResult::MayAlias; + + // At least one value is a call to an understood function + assert(A1 || A2); + assert(!!A1 == !!Fn1); + assert(!!A2 == !!Fn2); + + // Calls to two different functions can not be analyzed. + if (Fn1 && Fn2 && Fn1 != Fn2) + return AliasResult::MayAlias; + + // Pure functions return equal values given equal arguments. + AliasResult Equal = + isPureSet(intersectTapirFnBehavior(Behavior1, Behavior2)) ? + AliasResult::MustAlias : AliasResult::MayAlias; + + // This is for testing. The intended use is with pointer arguments. + if (A1 && A2 && isInjectiveSet(Behavior1)) { + if (const ConstantInt *I1 = dyn_cast(A1)) { + if (const ConstantInt *I2 = dyn_cast(A2)) + return I1->getValue() == I2->getValue() ? + Equal : AliasResult(AliasResult::NoAlias); + return AliasResult::MayAlias; + } + } + + bool Known1 = false, Known2 = false; + const Value *U1 = nullptr, *U2 = nullptr; + + if (A1) { + U1 = getUnderlyingObject(A1, MaxLookupSearchDepth); + Known1 = isIdentifiedObject(U1); + } + if (A2) { + U2 = getUnderlyingObject(A2, MaxLookupSearchDepth); + Known2 = isIdentifiedObject(U2); + } + + // Rules, in order: + // 1. Potentially unequal values based on the same object may alias. + // 2. View lookups do not alias allocas that do not alias the argument + if (!A1) { + if (!Known2) + return AliasResult::MayAlias; + if (O1 == U2) // 1 + return AliasResult::MayAlias; + if (isViewSet(Behavior2)) // 2 + return UnderlyingNoAlias(O1, U2, AAQI); + return AliasResult::MayAlias; + } + if (!A2) { + if (!Known1) + return AliasResult::MayAlias; + if (U1 == O2) // 1 + return AliasResult::MayAlias; + if (isViewSet(Behavior1)) // 2 + return UnderlyingNoAlias(U1, O2, AAQI); + return AliasResult::MayAlias; + } + + if (!isInjectiveSet(Behavior1)) + return AliasResult::MayAlias; + + // Two calls to the same function with the same value. + if (isValueEqualInPotentialCycles(A1, A2, AAQI)) + return Equal; + + // Two calls with different values based on the same object. 
+ if (U1 == U2) { + // TODO: Currently the caller only cares whether the result is NoAlias. + // If the caller relied on partial overlap detection a function like + // void *f(void *p) { return p; } + // could not be declared injective. + BasicAAResult::DecomposedGEP DecompGEP1 = + DecomposeGEPExpression(A1, DL, &AC, DT); + BasicAAResult::DecomposedGEP DecompGEP2 = + DecomposeGEPExpression(A2, DL, &AC, DT); + if (DecompGEP1.VarIndices.empty() && DecompGEP2.VarIndices.empty() && + isValueEqualInPotentialCycles(DecompGEP1.Base, DecompGEP2.Base, AAQI)) + return DecompGEP1.Offset == DecompGEP2.Offset + ? Equal + : AliasResult(AliasResult::NoAlias); + return AliasResult::MayAlias; + } + + return UnderlyingNoAlias(U1, U2, AAQI); +} + /// Provides a bunch of ad-hoc rules to disambiguate in common cases, such as /// array references. AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, @@ -1575,10 +1827,14 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, if (!NullPointerIsDefined(&F, CPN->getType()->getAddressSpace())) return AliasResult::NoAlias; - if (O1 != O2) { - // If V1/V2 point to two different objects, we know that we have no alias. - if (isIdentifiedObject(O1) && isIdentifiedObject(O2)) - return AliasResult::NoAlias; + // If the call is an injection (distinct argument implies + // distinct return) some more optimization is possible. + AliasResult InjectiveResult = + checkInjectiveArguments(V1, O1, V2, O2, AAQI); + if (InjectiveResult == AliasResult::NoAlias) + return AliasResult::NoAlias; + else if (InjectiveResult == AliasResult::MustAlias) + return AliasResult::MayAlias; // Function arguments can't alias with things that are known to be // unambigously identified at the function level. @@ -1603,6 +1859,9 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, return AliasResult::NoAlias; } + if (O1 != O2 && UnderlyingNoAlias(O1, O2, AAQI) == AliasResult::NoAlias) + return AliasResult::NoAlias; + // If the size of one access is larger than the entire object on the other // side, then we know such behavior is undefined and can assume no alias. 
bool NullIsValidLocation = NullPointerIsDefined(&F); diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 74476cb5440c610..ced9f627abee56d 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -47,6 +47,7 @@ add_llvm_component_library(LLVMAnalysis CodeMetrics.cpp ConstantFolding.cpp CycleAnalysis.cpp + DataRaceFreeAliasAnalysis.cpp DDG.cpp DDGPrinter.cpp ConstraintSystem.cpp @@ -126,6 +127,8 @@ add_llvm_component_library(LLVMAnalysis StructuralHash.cpp SyntheticCountsUtils.cpp TFLiteUtils.cpp + TapirRaceDetect.cpp + TapirTaskInfo.cpp TargetLibraryInfo.cpp TargetTransformInfo.cpp TensorSpec.cpp @@ -139,6 +142,8 @@ add_llvm_component_library(LLVMAnalysis ValueLatticeUtils.cpp ValueTracking.cpp VectorUtils.cpp + VFABIDemangling.cpp + WorkSpanAnalysis.cpp ${GeneratedMLSources} ADDITIONAL_HEADER_DIRS diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index a353842eb809c4a..ea4c7aece393584 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -283,6 +283,49 @@ UseCaptureKind llvm::DetermineUseCaptureKind( if (!I) return UseCaptureKind::MAY_CAPTURE; + if (ConstantExpr *CE = dyn_cast(I)) { + switch (CE->getOpcode()) { + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::Select: + // The original value is not captured via this if the new value isn't. + return UseCaptureKind::PASSTHROUGH; + case Instruction::ICmp: { + unsigned Idx = U.getOperandNo(); + unsigned OtherIdx = 1 - Idx; + if (auto *CPN = dyn_cast(CE->getOperand(OtherIdx))) { + // Don't count comparisons of a no-alias return value against null as + // captures. This allows us to ignore comparisons of malloc results + // with null, for example. + if (CPN->getType()->getAddressSpace() == 0) + if (isNoAliasCall(U.get()->stripPointerCasts())) + return UseCaptureKind::NO_CAPTURE; + if (!I->getFunction()->nullPointerIsDefined()) { + auto *O = I->getOperand(Idx)->stripPointerCastsSameRepresentation(); + // Comparing a dereferenceable_or_null pointer against null cannot + // lead to pointer escapes, because if it is not null it must be a + // valid (in-bounds) pointer. + const DataLayout &DL = I->getModule()->getDataLayout(); + if (IsDereferenceableOrNull && IsDereferenceableOrNull(O, DL)) + return UseCaptureKind::NO_CAPTURE; + } + } + // Comparison against value stored in global variable. Given the pointer + // does not escape, its value cannot be guessed and stored separately in a + // global variable. + auto *LI = dyn_cast(CE->getOperand(OtherIdx)); + if (LI && isa(LI->getPointerOperand())) + return UseCaptureKind::NO_CAPTURE; + // Otherwise, be conservative. There are crazy ways to capture pointers + // using comparisons. + return UseCaptureKind::MAY_CAPTURE; + } + default: + // Something else - be conservative and say it is captured. 
+ return UseCaptureKind::MAY_CAPTURE; + } + } + switch (I->getOpcode()) { case Instruction::Call: case Instruction::Invoke: { diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp index ea67b526423bf5b..385eb62fe09103a 100644 --- a/llvm/lib/Analysis/CodeMetrics.cpp +++ b/llvm/lib/Analysis/CodeMetrics.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" @@ -129,7 +130,7 @@ static bool extendsConvergenceOutsideLoop(const Instruction &I, const Loop *L) { void CodeMetrics::analyzeBasicBlock( const BasicBlock *BB, const TargetTransformInfo &TTI, const SmallPtrSetImpl &EphValues, bool PrepareForLTO, - const Loop *L) { + const Loop *L, TargetLibraryInfo* TLI) { ++NumBlocks; InstructionCost NumInstsBeforeThisBB = NumInsts; for (const Instruction &I : *BB) { @@ -161,6 +162,13 @@ void CodeMetrics::analyzeBasicBlock( if (IsLoweredToCall) ++NumCalls; + + // Check for a call to a builtin function or a Tapir-target library + // function. + LibFunc LF; + if (TLI && (TLI->getLibFunc(*F, LF) || TLI->isTapirTargetLibFunc(*F))) + ++NumBuiltinCalls; + } else { // We don't want inline asm to count as a call - that would prevent loop // unrolling. The argument setup cost is still real, though. @@ -182,7 +190,12 @@ void CodeMetrics::analyzeBasicBlock( LLVM_DEBUG(dbgs() << I << "\n Cannot duplicate a token value used outside " "the current block (except convergence control).\n"); - notDuplicatable = true; + if (const IntrinsicInst *II = dyn_cast(&I)) { + if (Intrinsic::syncregion_start != II->getIntrinsicID()) + notDuplicatable = true; + } else { + notDuplicatable = true; + } } if (const CallBase *CB = dyn_cast(&I)) { diff --git a/llvm/lib/Analysis/DataRaceFreeAliasAnalysis.cpp b/llvm/lib/Analysis/DataRaceFreeAliasAnalysis.cpp new file mode 100644 index 000000000000000..87df8da99f364a5 --- /dev/null +++ b/llvm/lib/Analysis/DataRaceFreeAliasAnalysis.cpp @@ -0,0 +1,145 @@ +//===- DataRaceFreeAliasAnalysis.cpp - DRF-based Alias Analysis -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the DataRaceFreeAliasAnalysis pass, which implements alias +// analysis based on the assumption that a Tapir program is data-race free. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/DataRaceFreeAliasAnalysis.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +#define DEBUG_TYPE "drf-aa-result" + +cl::opt llvm::EnableDRFAA( + "enable-drf-aa", cl::init(false), cl::Hidden, + cl::desc("Enable AA based on the data-race-free assumption " + "(default = off)")); + +bool DRFAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &Inv) { + // We don't care if this analysis itself is preserved, it has no state. 
But we + // need to check that the analyses it depends on have been. + if (Inv.invalidate(Fn, PA)) + return true; + + // Otherwise this analysis result remains valid. + return false; +} + +#ifndef NDEBUG +static const Function *getParent(const Value *V) { + if (const Instruction *inst = dyn_cast(V)) { + if (!inst->getParent()) + return nullptr; + return inst->getParent()->getParent(); + } + + if (const Argument *arg = dyn_cast(V)) + return arg->getParent(); + + return nullptr; +} + +static bool notDifferentParent(const Value *O1, const Value *O2) { + + const Function *F1 = getParent(O1); + const Function *F2 = getParent(O2); + + return !F1 || !F2 || F1 == F2; +} +#endif + +AliasResult DRFAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB, AAQueryInfo &AAQI, + const Instruction *CtxI) { + if (!EnableDRFAA) + return AAResultBase::alias(LocA, LocB, AAQI, CtxI); + + LLVM_DEBUG(dbgs() << "DRFAA:\n\tLocA.Ptr = " << *LocA.Ptr + << "\n\tLocB.Ptr = " << *LocB.Ptr << "\n"); + assert(notDifferentParent(LocA.Ptr, LocB.Ptr) && + "DRFAliasAnalysis doesn't support interprocedural queries."); + + if (const Instruction *AddrA = dyn_cast(LocA.Ptr)) + if (const Instruction *AddrB = dyn_cast(LocB.Ptr)) + if (TI.mayHappenInParallel(AddrA->getParent(), AddrB->getParent())) + return AliasResult::NoAlias; + return AAResultBase::alias(LocA, LocB, AAQI, CtxI); +} + +ModRefInfo DRFAAResult::getModRefInfo(const CallBase *Call, + const MemoryLocation &Loc, + AAQueryInfo &AAQI) { + if (!EnableDRFAA) + return AAResultBase::getModRefInfo(Call, Loc, AAQI); + + LLVM_DEBUG(dbgs() << "DRFAA:getModRefInfo(Call, Loc)\n"); + assert(notDifferentParent(Call, Loc.Ptr) && + "DRFAliasAnalysis doesn't support interprocedural queries."); + + if (const Instruction *Addr = dyn_cast(Loc.Ptr)) + if (TI.mayHappenInParallel(Call->getParent(), Addr->getParent())) + return ModRefInfo::NoModRef; + + return AAResultBase::getModRefInfo(Call, Loc, AAQI); +} + +ModRefInfo DRFAAResult::getModRefInfo(const CallBase *Call1, + const CallBase *Call2, + AAQueryInfo &AAQI) { + if (!EnableDRFAA) + return AAResultBase::getModRefInfo(Call1, Call2, AAQI); + + LLVM_DEBUG(dbgs() << "DRFAA:getModRefInfo(Call1, Call2)\n"); + + if (TI.mayHappenInParallel(Call1->getParent(), Call2->getParent())) + return ModRefInfo::NoModRef; + + return AAResultBase::getModRefInfo(Call1, Call2, AAQI); +} + +AnalysisKey DRFAA::Key; + +DRFAAResult DRFAA::run(Function &F, FunctionAnalysisManager &AM) { + return DRFAAResult(AM.getResult(F)); +} + +char DRFAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(DRFAAWrapperPass, "drf-aa", + "DRF-based Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) +INITIALIZE_PASS_END(DRFAAWrapperPass, "drf-aa", + "DRF-based Alias Analysis", false, true) + +FunctionPass *llvm::createDRFAAWrapperPass() { + return new DRFAAWrapperPass(); +} + +DRFAAWrapperPass::DRFAAWrapperPass() : FunctionPass(ID) { + initializeDRFAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool DRFAAWrapperPass::runOnFunction(Function &F) { + Result.reset( + new DRFAAResult(getAnalysis().getTaskInfo())); + return false; +} + +void DRFAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired(); +} diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 8e44d548cb56f26..9deaa07dc43b29a 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -54,6 +54,7 @@ #include "llvm/Analysis/AliasAnalysis.h" 
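Editorial aside on the DataRaceFreeAliasAnalysis above (not part of the patch): the data-race-free assumption is easiest to see at the source level. In the hedged sketch below, the two calls may run in parallel, so in a race-free program they cannot touch the same memory; that is the reasoning behind returning `NoModRef` from `DRFAAResult::getModRefInfo(Call1, Call2, ...)` when `TaskInfo::mayHappenInParallel` holds. `cilk_spawn`, `cilk_sync`, and `-fopencilk` are standard OpenCilk usage; the analysis itself is only consulted when the hidden `-enable-drf-aa` option added above is set.

```cpp
// Build with OpenCilk, e.g.: clang++ -fopencilk -O2 -c drf_sketch.cpp
#include <cilk/cilk.h>

static void bump(int *p) { *p += 1; }

void update(int *a, int *b) {
  cilk_spawn bump(a); // strand that may run in parallel with the next call
  bump(b);
  cilk_sync;
  // If this program is data-race free, bump(a) and bump(b) never write the
  // same location, so the two call sites can be treated as NoModRef with
  // respect to each other.
}
```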
#include "llvm/Analysis/Delinearization.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" @@ -376,6 +377,26 @@ bool FullDependence::isSplitable(unsigned Level) const { } +//===----------------------------------------------------------------------===// +// GeneralAccess methods + +raw_ostream &llvm::operator<<(raw_ostream &OS, const GeneralAccess &GA) { + if (!GA.isValid()) + OS << "(invalid GeneralAccess)"; + else { + OS << "(GA.I: " << *GA.I; + OS << ", GA.Loc: "; + if (!GA.Loc) + OS << "nullptr"; + else + OS << *GA.Loc->Ptr; + OS << ", GA.OperandNum: " << static_cast(GA.OperandNum); + OS << ", GA.ModRef: " << static_cast(GA.ModRef); + OS << ")"; + } + return OS; +} + //===----------------------------------------------------------------------===// // DependenceInfo::Constraint methods @@ -831,6 +852,7 @@ void DependenceInfo::establishNestingLevels(const Instruction *Src, } CommonLevels = SrcLevel; MaxLevels -= CommonLevels; + CommonLoop = SrcLoop; } @@ -1049,7 +1071,7 @@ DependenceInfo::classifyPair(const SCEV *Src, const Loop *SrcLoopNest, // we try simple subtraction, which seems to help in some cases // involving symbolics. bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, - const SCEV *Y) const { + const SCEV *Y, const Loop *L) const { if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { if ((isa(X) && @@ -1068,6 +1090,9 @@ bool DependenceInfo::isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *X, } if (SE->isKnownPredicate(Pred, X, Y)) return true; + if (L && isLoopInvariant(X, L) && isLoopInvariant(Y, L) && + isTrueAtLoopEntry(L, Pred, X, Y)) + return true; // If SE->isKnownPredicate can't prove the condition, // we try the brute-force approach of subtracting // and testing the difference. @@ -2804,10 +2829,10 @@ bool DependenceInfo::testBounds(unsigned char DirKind, unsigned Level, BoundInfo *Bound, const SCEV *Delta) const { Bound[Level].Direction = DirKind; if (const SCEV *LowerBound = getLowerBound(Bound)) - if (isKnownPredicate(CmpInst::ICMP_SGT, LowerBound, Delta)) + if (isKnownPredicate(CmpInst::ICMP_SGT, LowerBound, Delta, CommonLoop)) return false; if (const SCEV *UpperBound = getUpperBound(Bound)) - if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, UpperBound)) + if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, UpperBound, CommonLoop)) return false; return true; } @@ -2842,10 +2867,12 @@ void DependenceInfo::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, } else { // If the difference is 0, we won't need to know the number of iterations. - if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart)) + if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart, + CommonLoop)) Bound[K].Lower[Dependence::DVEntry::ALL] = SE->getZero(A[K].Coeff->getType()); - if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].PosPart, B[K].NegPart)) + if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].PosPart, B[K].NegPart, + CommonLoop)) Bound[K].Upper[Dependence::DVEntry::ALL] = SE->getZero(A[K].Coeff->getType()); } @@ -2980,14 +3007,43 @@ void DependenceInfo::findBoundsGT(CoefficientInfo *A, CoefficientInfo *B, } +// Returns true if predicate LHS `Pred` RHS is true at entry of L. 
+bool DependenceInfo::isTrueAtLoopEntry(const Loop *L, ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS) const { + return SE->isLoopEntryGuardedByCond(L, Pred, LHS, RHS); +} + + // X^+ = max(X, 0) const SCEV *DependenceInfo::getPositivePart(const SCEV *X) const { + if (CommonLoop) { + const SCEV *Zero = SE->getZero(X->getType()); + if (!SE->isLoopInvariant(X, CommonLoop)) + return SE->getSMaxExpr(X, SE->getZero(X->getType())); + if (isTrueAtLoopEntry(CommonLoop, CmpInst::ICMP_SGT, X, Zero) || + isTrueAtLoopEntry(CommonLoop, CmpInst::ICMP_SGT, + Zero, SE->getNegativeSCEV(X))) + return X; + if (isTrueAtLoopEntry(CommonLoop, CmpInst::ICMP_SGE, Zero, X)) + return Zero; + } return SE->getSMaxExpr(X, SE->getZero(X->getType())); } // X^- = min(X, 0) const SCEV *DependenceInfo::getNegativePart(const SCEV *X) const { + if (CommonLoop) { + const SCEV *Zero = SE->getZero(X->getType()); + if (!SE->isLoopInvariant(X, CommonLoop)) + return SE->getSMinExpr(X, SE->getZero(X->getType())); + if (isTrueAtLoopEntry(CommonLoop, CmpInst::ICMP_SGT, Zero, X) || + isTrueAtLoopEntry(CommonLoop, CmpInst::ICMP_SGT, + SE->getNegativeSCEV(X), Zero)) + return X; + if (isTrueAtLoopEntry(CommonLoop, CmpInst::ICMP_SGE, X, Zero)) + return Zero; + } return SE->getSMinExpr(X, SE->getZero(X->getType())); } @@ -3013,6 +3069,14 @@ DependenceInfo::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag, CI[K].PosPart = getPositivePart(CI[K].Coeff); CI[K].NegPart = getNegativePart(CI[K].Coeff); CI[K].Iterations = collectUpperBound(L, Subscript->getType()); + if (const SCEVCastExpr *Cast = + dyn_cast(CI[K].PosPart)) { + auto *ReplSCEV = SE->getZeroExtendExpr(Cast->getOperand(), + Subscript->getType()); + if (CI[K].Coeff == CI[K].PosPart) + CI[K].Coeff = ReplSCEV; + CI[K].PosPart = ReplSCEV; + } Subscript = AddRec->getStart(); } Constant = Subscript; @@ -4199,3 +4263,678 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep, llvm_unreachable("somehow reached end of routine"); return nullptr; } + +static Value *getGeneralAccessPointerOperand(GeneralAccess *A) { + return const_cast(A->Loc->Ptr); +} + +static +const SCEV *getElementSize(GeneralAccess *A, ScalarEvolution *SE) { + Type *Ty = getGeneralAccessPointerOperand(A)->getType(); + Type *ETy = SE->getEffectiveSCEVType(PointerType::getUnqual(Ty)); + if (A->Loc) { + if (A->Loc->Size.hasValue()) + return SE->getConstant(ETy, A->Loc->Size.getValue()); + else + return SE->getCouldNotCompute(); + } else + return SE->getCouldNotCompute(); +} + +/// Check if we can delinearize the subscripts. If the SCEVs representing the +/// source and destination array references are recurrences on a nested loop, +/// this function flattens the nested recurrences into separate recurrences +/// for each loop level. 
+bool DependenceInfo::tryDelinearize(GeneralAccess *SrcA, GeneralAccess *DstA, + SmallVectorImpl &Pair) { + Value *SrcPtr = getGeneralAccessPointerOperand(SrcA); + Value *DstPtr = getGeneralAccessPointerOperand(DstA); + + Loop *SrcLoop = LI->getLoopFor(SrcA->I->getParent()); + Loop *DstLoop = LI->getLoopFor(DstA->I->getParent()); + + // Below code mimics the code in Delinearization.cpp + const SCEV *SrcAccessFn = SE->getSCEVAtScope(SrcPtr, SrcLoop); + const SCEV *DstAccessFn = SE->getSCEVAtScope(DstPtr, DstLoop); + const SCEVUnknown *SrcBase = + dyn_cast(SE->getPointerBase(SrcAccessFn)); + const SCEVUnknown *DstBase = + dyn_cast(SE->getPointerBase(DstAccessFn)); + + if (!SrcBase || !DstBase || SrcBase != DstBase) + return false; + + + SmallVector SrcSubscripts, DstSubscripts; + + if (!tryDelinearizeFixedSize(SrcA, DstA, SrcAccessFn, DstAccessFn, + SrcSubscripts, DstSubscripts) && + !tryDelinearizeParametricSize(SrcA, DstA, SrcAccessFn, DstAccessFn, + SrcSubscripts, DstSubscripts)) + return false; + + int Size = SrcSubscripts.size(); + LLVM_DEBUG({ + dbgs() << "\nSrcSubscripts: "; + for (int I = 0; I < Size; I++) + dbgs() << *SrcSubscripts[I]; + dbgs() << "\nDstSubscripts: "; + for (int I = 0; I < Size; I++) + dbgs() << *DstSubscripts[I]; + }); + + // The delinearization transforms a single-subscript MIV dependence test into + // a multi-subscript SIV dependence test that is easier to compute. So we + // resize Pair to contain as many pairs of subscripts as the delinearization + // has found, and then initialize the pairs following the delinearization. + Pair.resize(Size); + for (int I = 0; I < Size; ++I) { + Pair[I].Src = SrcSubscripts[I]; + Pair[I].Dst = DstSubscripts[I]; + unifySubscriptType(&Pair[I]); + } + + return true; +} + +static bool tryDelinearizeGAFixedSizeImpl( + ScalarEvolution *SE, GeneralAccess *GA, const SCEV *AccessFn, + SmallVectorImpl &Subscripts, SmallVectorImpl &Sizes) { + Value *SrcPtr = getGeneralAccessPointerOperand(GA); + + // Check the simple case where the array dimensions are fixed size. + auto *SrcGEP = dyn_cast(SrcPtr); + if (!SrcGEP) + return false; + + getIndexExpressionsFromGEP(*SE, SrcGEP, Subscripts, Sizes); + + // Check that the two size arrays are non-empty and equal in length and + // value. + // TODO: it would be better to let the caller to clear Subscripts, similar + // to how we handle Sizes. + if (Sizes.empty() || Subscripts.size() <= 1) { + Subscripts.clear(); + return false; + } + + // Check that for identical base pointers we do not miss index offsets + // that have been added before this GEP is applied. 
+ Value *SrcBasePtr = SrcGEP->getOperand(0)->stripPointerCasts(); + const SCEVUnknown *SrcBase = + dyn_cast(SE->getPointerBase(AccessFn)); + if (!SrcBase || SrcBasePtr != SrcBase->getValue()) { + Subscripts.clear(); + return false; + } + + assert(Subscripts.size() == Sizes.size() + 1 && + "Expected equal number of entries in the list of size and " + "subscript."); + + return true; +} + +bool DependenceInfo::tryDelinearizeFixedSize( + GeneralAccess *SrcA, GeneralAccess *DstA, const SCEV *SrcAccessFn, + const SCEV *DstAccessFn, SmallVectorImpl &SrcSubscripts, + SmallVectorImpl &DstSubscripts) { + LLVM_DEBUG({ + const SCEVUnknown *SrcBase = + dyn_cast(SE->getPointerBase(SrcAccessFn)); + const SCEVUnknown *DstBase = + dyn_cast(SE->getPointerBase(DstAccessFn)); + assert(SrcBase && DstBase && SrcBase == DstBase && + "expected src and dst scev unknowns to be equal"); + }); + + SmallVector SrcSizes; + SmallVector DstSizes; + if (!tryDelinearizeGAFixedSizeImpl(SE, SrcA, SrcAccessFn, SrcSubscripts, + SrcSizes) || + !tryDelinearizeGAFixedSizeImpl(SE, DstA, DstAccessFn, DstSubscripts, + DstSizes)) + return false; + + // Check that the two size arrays are non-empty and equal in length and + // value. + if (SrcSizes.size() != DstSizes.size() || + !std::equal(SrcSizes.begin(), SrcSizes.end(), DstSizes.begin())) { + SrcSubscripts.clear(); + DstSubscripts.clear(); + return false; + } + + assert(SrcSubscripts.size() == DstSubscripts.size() && + "Expected equal number of entries in the list of SrcSubscripts and " + "DstSubscripts."); + + Value *SrcPtr = getGeneralAccessPointerOperand(SrcA); + Value *DstPtr = getGeneralAccessPointerOperand(DstA); + + // In general we cannot safely assume that the subscripts recovered from GEPs + // are in the range of values defined for their corresponding array + // dimensions. For example some C language usage/interpretation make it + // impossible to verify this at compile-time. As such we give up here unless + // we can assume that the subscripts do not overlap into neighboring + // dimensions and that the number of dimensions matches the number of + // subscripts being recovered. 
+ if (!DisableDelinearizationChecks) { + auto AllIndiciesInRange = [&](SmallVector &DimensionSizes, + SmallVectorImpl &Subscripts, + Value *Ptr) { + size_t SSize = Subscripts.size(); + for (size_t I = 1; I < SSize; ++I) { + const SCEV *S = Subscripts[I]; + if (!isKnownNonNegative(S, Ptr)) + return false; + if (auto *SType = dyn_cast(S->getType())) { + const SCEV *Range = SE->getConstant( + ConstantInt::get(SType, DimensionSizes[I - 1], false)); + if (!isKnownLessThan(S, Range)) + return false; + } + } + return true; + }; + + if (!AllIndiciesInRange(SrcSizes, SrcSubscripts, SrcPtr) || + !AllIndiciesInRange(DstSizes, DstSubscripts, DstPtr)) { + SrcSubscripts.clear(); + DstSubscripts.clear(); + return false; + } + } + LLVM_DEBUG({ + dbgs() << "Delinearized subscripts of fixed-size array\n" + << "SrcGEP:" << *SrcPtr << "\n" + << "DstGEP:" << *DstPtr << "\n"; + }); + return true; +} + +bool DependenceInfo::tryDelinearizeParametricSize( + GeneralAccess *SrcA, GeneralAccess *DstA, const SCEV *SrcAccessFn, + const SCEV *DstAccessFn, SmallVectorImpl &SrcSubscripts, + SmallVectorImpl &DstSubscripts) { + + Value *SrcPtr = getGeneralAccessPointerOperand(SrcA); + Value *DstPtr = getGeneralAccessPointerOperand(DstA); + const SCEVUnknown *SrcBase = + dyn_cast(SE->getPointerBase(SrcAccessFn)); + const SCEVUnknown *DstBase = + dyn_cast(SE->getPointerBase(DstAccessFn)); + assert(SrcBase && DstBase && SrcBase == DstBase && + "expected src and dst scev unknowns to be equal"); + + const SCEV *ElementSize = getElementSize(SrcA, SE); + if (isa(ElementSize)) + return false; + if (ElementSize != getElementSize(DstA, SE)) + return false; + + const SCEV *SrcSCEV = SE->getMinusSCEV(SrcAccessFn, SrcBase); + const SCEV *DstSCEV = SE->getMinusSCEV(DstAccessFn, DstBase); + + const SCEVAddRecExpr *SrcAR = dyn_cast(SrcSCEV); + const SCEVAddRecExpr *DstAR = dyn_cast(DstSCEV); + if (!SrcAR || !DstAR || !SrcAR->isAffine() || !DstAR->isAffine()) + return false; + + // First step: collect parametric terms in both array references. + SmallVector Terms; + collectParametricTerms(*SE, SrcAR, Terms); + collectParametricTerms(*SE, DstAR, Terms); + + // Second step: find subscript sizes. + SmallVector Sizes; + findArrayDimensions(*SE, Terms, Sizes, ElementSize); + + // Third step: compute the access functions for each subscript. + computeAccessFunctions(*SE, SrcAR, SrcSubscripts, Sizes); + computeAccessFunctions(*SE, DstAR, DstSubscripts, Sizes); + + // Fail when there is only a subscript: that's a linearized access function. + if (SrcSubscripts.size() < 2 || DstSubscripts.size() < 2 || + SrcSubscripts.size() != DstSubscripts.size()) + return false; + + size_t Size = SrcSubscripts.size(); + + // Statically check that the array bounds are in-range. The first subscript we + // don't have a size for and it cannot overflow into another subscript, so is + // always safe. The others need to be 0 <= subscript[i] < bound, for both src + // and dst. + // FIXME: It may be better to record these sizes and add them as constraints + // to the dependency checks. + if (!DisableDelinearizationChecks) + for (size_t I = 1; I < Size; ++I) { + if (!isKnownNonNegative(SrcSubscripts[I], SrcPtr)) + return false; + + if (!isKnownLessThan(SrcSubscripts[I], Sizes[I - 1])) + return false; + + if (!isKnownNonNegative(DstSubscripts[I], DstPtr)) + return false; + + if (!isKnownLessThan(DstSubscripts[I], Sizes[I - 1])) + return false; + } + + return true; +} + +// depends - +// Returns NULL if there is no dependence. 
+// Otherwise, return a Dependence with as many details as possible. +// Corresponds to Section 3.1 in the paper +// +// Practical Dependence Testing +// Goff, Kennedy, Tseng +// PLDI 1991 +std::unique_ptr +DependenceInfo::depends(GeneralAccess *SrcA, GeneralAccess *DstA, + bool PossiblyLoopIndependent) { + if (SrcA == DstA) + PossiblyLoopIndependent = false; + + Instruction *Src = SrcA->I; + Instruction *Dst = DstA->I; + + if (!Src || !Dst) + // If we don't have a source or destination instruction, we don't have a + // dependence. + return nullptr; + + if (!(Src->mayReadOrWriteMemory() && Dst->mayReadOrWriteMemory())) + // if both instructions don't reference memory, there's no dependence + return nullptr; + + if (!SrcA->isValid() || !DstA->isValid()) { + LLVM_DEBUG(dbgs() << "could not interpret general accesses\n"); + return std::make_unique(Src, Dst); + } + + Value *SrcPtr = getGeneralAccessPointerOperand(SrcA); + Value *DstPtr = getGeneralAccessPointerOperand(DstA); + + switch (underlyingObjectsAlias(AA, F->getParent()->getDataLayout(), + *DstA->Loc, *SrcA->Loc)) { + case AliasResult::MayAlias: + case AliasResult::PartialAlias: + // cannot analyse objects if we don't understand their aliasing. + LLVM_DEBUG(dbgs() << "can't analyze may or partial alias\n"); + return std::make_unique(Src, Dst); + case AliasResult::NoAlias: + // If the objects noalias, they are distinct, accesses are independent. + LLVM_DEBUG(dbgs() << "no alias\n"); + return nullptr; + case AliasResult::MustAlias: + break; // The underlying objects alias; test accesses for dependence. + } + + // If either Src or Dst is a call, and we are uncertain about the accessed + // location's size, give up. + if (isa(Src)) + if (!SrcA->Loc->Size.hasValue()) + return std::make_unique(Src, Dst); + if (isa(Dst)) + if (!DstA->Loc->Size.hasValue()) + return std::make_unique(Src, Dst); + + // establish loop nesting levels + establishNestingLevels(Src, Dst); + LLVM_DEBUG(dbgs() << " common nesting levels = " << CommonLevels << "\n"); + LLVM_DEBUG(dbgs() << " maximum nesting levels = " << MaxLevels << "\n"); + + FullDependence Result(Src, Dst, PossiblyLoopIndependent, CommonLevels); + ++TotalArrayPairs; + + unsigned Pairs = 1; + SmallVector Pair(Pairs); + if (!SE->isSCEVable(SrcPtr->getType()) || + !SE->isSCEVable(DstPtr->getType())) { + LLVM_DEBUG(dbgs() << "can't analyze non-scevable pointers\n"); + return std::make_unique(Src, Dst); + } + const SCEV *SrcSCEV = SE->getSCEV(SrcPtr); + const SCEV *DstSCEV = SE->getSCEV(DstPtr); + LLVM_DEBUG(dbgs() << " SrcSCEV = " << *SrcSCEV << "\n"); + LLVM_DEBUG(dbgs() << " DstSCEV = " << *DstSCEV << "\n"); + if (SE->getPointerBase(SrcSCEV) != SE->getPointerBase(DstSCEV)) { + // If two pointers have different bases, trying to analyze indexes won't + // work; we can't compare them to each other. This can happen, for example, + // if one is produced by an LCSSA PHI node. + // + // We check this upfront so we don't crash in cases where getMinusSCEV() + // returns a SCEVCouldNotCompute. 
+ LLVM_DEBUG(dbgs() << "can't analyze SCEV with different pointer base\n"); + return std::make_unique(Src, Dst); + } + Pair[0].Src = SrcSCEV; + Pair[0].Dst = DstSCEV; + + if (Delinearize) { + if (tryDelinearize(SrcA, DstA, Pair)) { + LLVM_DEBUG(dbgs() << " delinearized\n"); + Pairs = Pair.size(); + } + } + + for (unsigned P = 0; P < Pairs; ++P) { + Pair[P].Loops.resize(MaxLevels + 1); + Pair[P].GroupLoops.resize(MaxLevels + 1); + Pair[P].Group.resize(Pairs); + removeMatchingExtensions(&Pair[P]); + Pair[P].Classification = + classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()), + Pair[P].Dst, LI->getLoopFor(Dst->getParent()), + Pair[P].Loops); + Pair[P].GroupLoops = Pair[P].Loops; + Pair[P].Group.set(P); + LLVM_DEBUG(dbgs() << " subscript " << P << "\n"); + LLVM_DEBUG(dbgs() << "\tsrc = " << *Pair[P].Src << "\n"); + LLVM_DEBUG(dbgs() << "\tdst = " << *Pair[P].Dst << "\n"); + LLVM_DEBUG(dbgs() << "\tclass = " << Pair[P].Classification << "\n"); + LLVM_DEBUG(dbgs() << "\tloops = "); + LLVM_DEBUG(dumpSmallBitVector(Pair[P].Loops)); + } + + SmallBitVector Separable(Pairs); + SmallBitVector Coupled(Pairs); + + // Partition subscripts into separable and minimally-coupled groups + // Algorithm in paper is algorithmically better; + // this may be faster in practice. Check someday. + // + // Here's an example of how it works. Consider this code: + // + // for (i = ...) { + // for (j = ...) { + // for (k = ...) { + // for (l = ...) { + // for (m = ...) { + // A[i][j][k][m] = ...; + // ... = A[0][j][l][i + j]; + // } + // } + // } + // } + // } + // + // There are 4 subscripts here: + // 0 [i] and [0] + // 1 [j] and [j] + // 2 [k] and [l] + // 3 [m] and [i + j] + // + // We've already classified each subscript pair as ZIV, SIV, etc., + // and collected all the loops mentioned by pair P in Pair[P].Loops. + // In addition, we've initialized Pair[P].GroupLoops to Pair[P].Loops + // and set Pair[P].Group = {P}. + // + // Src Dst Classification Loops GroupLoops Group + // 0 [i] [0] SIV {1} {1} {0} + // 1 [j] [j] SIV {2} {2} {1} + // 2 [k] [l] RDIV {3,4} {3,4} {2} + // 3 [m] [i + j] MIV {1,2,5} {1,2,5} {3} + // + // For each subscript SI 0 .. 3, we consider each remaining subscript, SJ. + // So, 0 is compared against 1, 2, and 3; 1 is compared against 2 and 3, etc. + // + // We begin by comparing 0 and 1. The intersection of the GroupLoops is empty. + // Next, 0 and 2. Again, the intersection of their GroupLoops is empty. + // Next 0 and 3. The intersection of their GroupLoop = {1}, not empty, + // so Pair[3].Group = {0,3} and Done = false (that is, 0 will not be added + // to either Separable or Coupled). + // + // Next, we consider 1 and 2. The intersection of the GroupLoops is empty. + // Next, 1 and 3. The intersection of their GroupLoops = {2}, not empty, + // so Pair[3].Group = {0, 1, 3} and Done = false. + // + // Next, we compare 2 against 3. The intersection of the GroupLoops is empty. + // Since Done remains true, we add 2 to the set of Separable pairs. + // + // Finally, we consider 3. There's nothing to compare it with, + // so Done remains true and we add it to the Coupled set. + // Pair[3].Group = {0, 1, 3} and GroupLoops = {1, 2, 5}. + // + // In the end, we've got 1 separable subscript and 1 coupled group. 
+ for (unsigned SI = 0; SI < Pairs; ++SI) { + if (Pair[SI].Classification == Subscript::NonLinear) { + // ignore these, but collect loops for later + ++NonlinearSubscriptPairs; + collectCommonLoops(Pair[SI].Src, + LI->getLoopFor(Src->getParent()), + Pair[SI].Loops); + collectCommonLoops(Pair[SI].Dst, + LI->getLoopFor(Dst->getParent()), + Pair[SI].Loops); + Result.Consistent = false; + } else if (Pair[SI].Classification == Subscript::ZIV) { + // always separable + Separable.set(SI); + } + else { + // SIV, RDIV, or MIV, so check for coupled group + bool Done = true; + for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) { + SmallBitVector Intersection = Pair[SI].GroupLoops; + Intersection &= Pair[SJ].GroupLoops; + if (Intersection.any()) { + // accumulate set of all the loops in group + Pair[SJ].GroupLoops |= Pair[SI].GroupLoops; + // accumulate set of all subscripts in group + Pair[SJ].Group |= Pair[SI].Group; + Done = false; + } + } + if (Done) { + if (Pair[SI].Group.count() == 1) { + Separable.set(SI); + ++SeparableSubscriptPairs; + } + else { + Coupled.set(SI); + ++CoupledSubscriptPairs; + } + } + } + } + + LLVM_DEBUG(dbgs() << " Separable = "); + LLVM_DEBUG(dumpSmallBitVector(Separable)); + LLVM_DEBUG(dbgs() << " Coupled = "); + LLVM_DEBUG(dumpSmallBitVector(Coupled)); + + Constraint NewConstraint; + NewConstraint.setAny(SE); + + // test separable subscripts + for (unsigned SI : Separable.set_bits()) { + LLVM_DEBUG(dbgs() << "testing subscript " << SI); + switch (Pair[SI].Classification) { + case Subscript::ZIV: + LLVM_DEBUG(dbgs() << ", ZIV\n"); + if (testZIV(Pair[SI].Src, Pair[SI].Dst, Result)) + return nullptr; + break; + case Subscript::SIV: { + LLVM_DEBUG(dbgs() << ", SIV\n"); + unsigned Level; + const SCEV *SplitIter = nullptr; + if (testSIV(Pair[SI].Src, Pair[SI].Dst, Level, Result, NewConstraint, + SplitIter)) + return nullptr; + break; + } + case Subscript::RDIV: + LLVM_DEBUG(dbgs() << ", RDIV\n"); + if (testRDIV(Pair[SI].Src, Pair[SI].Dst, Result)) + return nullptr; + break; + case Subscript::MIV: + LLVM_DEBUG(dbgs() << ", MIV\n"); + if (testMIV(Pair[SI].Src, Pair[SI].Dst, Pair[SI].Loops, Result)) + return nullptr; + break; + default: + llvm_unreachable("subscript has unexpected classification"); + } + } + + if (Coupled.count()) { + // test coupled subscript groups + LLVM_DEBUG(dbgs() << "starting on coupled subscripts\n"); + LLVM_DEBUG(dbgs() << "MaxLevels + 1 = " << MaxLevels + 1 << "\n"); + SmallVector Constraints(MaxLevels + 1); + for (unsigned II = 0; II <= MaxLevels; ++II) + Constraints[II].setAny(SE); + for (unsigned SI : Coupled.set_bits()) { + LLVM_DEBUG(dbgs() << "testing subscript group " << SI << " { "); + SmallBitVector Group(Pair[SI].Group); + SmallBitVector Sivs(Pairs); + SmallBitVector Mivs(Pairs); + SmallBitVector ConstrainedLevels(MaxLevels + 1); + SmallVector PairsInGroup; + for (unsigned SJ : Group.set_bits()) { + LLVM_DEBUG(dbgs() << SJ << " "); + if (Pair[SJ].Classification == Subscript::SIV) + Sivs.set(SJ); + else + Mivs.set(SJ); + PairsInGroup.push_back(&Pair[SJ]); + } + unifySubscriptType(PairsInGroup); + LLVM_DEBUG(dbgs() << "}\n"); + while (Sivs.any()) { + bool Changed = false; + for (unsigned SJ : Sivs.set_bits()) { + LLVM_DEBUG(dbgs() << "testing subscript " << SJ << ", SIV\n"); + // SJ is an SIV subscript that's part of the current coupled group + unsigned Level; + const SCEV *SplitIter = nullptr; + LLVM_DEBUG(dbgs() << "SIV\n"); + if (testSIV(Pair[SJ].Src, Pair[SJ].Dst, Level, Result, NewConstraint, + SplitIter)) + return nullptr; + 
ConstrainedLevels.set(Level); + if (intersectConstraints(&Constraints[Level], &NewConstraint)) { + if (Constraints[Level].isEmpty()) { + ++DeltaIndependence; + return nullptr; + } + Changed = true; + } + Sivs.reset(SJ); + } + if (Changed) { + // propagate, possibly creating new SIVs and ZIVs + LLVM_DEBUG(dbgs() << " propagating\n"); + LLVM_DEBUG(dbgs() << "\tMivs = "); + LLVM_DEBUG(dumpSmallBitVector(Mivs)); + for (unsigned SJ : Mivs.set_bits()) { + // SJ is an MIV subscript that's part of the current coupled group + LLVM_DEBUG(dbgs() << "\tSJ = " << SJ << "\n"); + if (propagate(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, + Constraints, Result.Consistent)) { + LLVM_DEBUG(dbgs() << "\t Changed\n"); + ++DeltaPropagations; + Pair[SJ].Classification = + classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()), + Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()), + Pair[SJ].Loops); + switch (Pair[SJ].Classification) { + case Subscript::ZIV: + LLVM_DEBUG(dbgs() << "ZIV\n"); + if (testZIV(Pair[SJ].Src, Pair[SJ].Dst, Result)) + return nullptr; + Mivs.reset(SJ); + break; + case Subscript::SIV: + Sivs.set(SJ); + Mivs.reset(SJ); + break; + case Subscript::RDIV: + case Subscript::MIV: + break; + default: + llvm_unreachable("bad subscript classification"); + } + } + } + } + } + + // test & propagate remaining RDIVs + for (unsigned SJ : Mivs.set_bits()) { + if (Pair[SJ].Classification == Subscript::RDIV) { + LLVM_DEBUG(dbgs() << "RDIV test\n"); + if (testRDIV(Pair[SJ].Src, Pair[SJ].Dst, Result)) + return nullptr; + // I don't yet understand how to propagate RDIV results + Mivs.reset(SJ); + } + } + + // test remaining MIVs + // This code is temporary. + // Better to somehow test all remaining subscripts simultaneously. + for (unsigned SJ : Mivs.set_bits()) { + if (Pair[SJ].Classification == Subscript::MIV) { + LLVM_DEBUG(dbgs() << "MIV test\n"); + if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result)) + return nullptr; + } + else + llvm_unreachable("expected only MIV subscripts at this point"); + } + + // update Result.DV from constraint vector + LLVM_DEBUG(dbgs() << " updating\n"); + for (unsigned SJ : ConstrainedLevels.set_bits()) { + if (SJ > CommonLevels) + break; + updateDirection(Result.DV[SJ - 1], Constraints[SJ]); + if (Result.DV[SJ - 1].Direction == Dependence::DVEntry::NONE) + return nullptr; + } + } + } + + // Make sure the Scalar flags are set correctly. + SmallBitVector CompleteLoops(MaxLevels + 1); + for (unsigned SI = 0; SI < Pairs; ++SI) + CompleteLoops |= Pair[SI].Loops; + for (unsigned II = 1; II <= CommonLevels; ++II) + if (CompleteLoops[II]) + Result.DV[II - 1].Scalar = false; + + if (PossiblyLoopIndependent) { + // Make sure the LoopIndependent flag is set correctly. + // All directions must include equal, otherwise no + // loop-independent dependence is possible. + for (unsigned II = 1; II <= CommonLevels; ++II) { + if (!(Result.getDirection(II) & Dependence::DVEntry::EQ)) { + Result.LoopIndependent = false; + break; + } + } + } + else { + // On the other hand, if all directions are equal and there's no + // loop-independent dependence possible, then no dependence exists. 
+ bool AllEqual = true; + for (unsigned II = 1; II <= CommonLevels; ++II) { + if (Result.getDirection(II) != Dependence::DVEntry::EQ) { + AllEqual = false; + break; + } + } + if (AllEqual) + return nullptr; + } + + return std::make_unique(std::move(Result)); +} diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 345e5a0195201c0..e7c963acec409c3 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -490,6 +490,8 @@ class CallAnalyzer : public InstVisitor { bool visitCleanupReturnInst(CleanupReturnInst &RI); bool visitCatchReturnInst(CatchReturnInst &RI); bool visitUnreachableInst(UnreachableInst &I); + bool visitReattachInst(ReattachInst &RI); + bool visitSyncInst(SyncInst &RI); public: CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI, @@ -2328,6 +2330,11 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { return simplifyIntrinsicCallIsConstant(Call); case Intrinsic::objectsize: return simplifyIntrinsicCallObjectSize(Call); + case Intrinsic::detached_rethrow: + case Intrinsic::taskframe_resume: + // Similarly to returns from a spawned task, we treat detached.rethrow and + // taskframe.resume intrinsics as free. + return true; } } @@ -2511,6 +2518,16 @@ bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { return true; // No actual code is needed for unreachable. } +bool CallAnalyzer::visitReattachInst(ReattachInst &RI) { + // We model reattach instructions as free, sort of like return instructions. + return true; +} + +bool CallAnalyzer::visitSyncInst(SyncInst &SI) { + // We model sync instructions as free, sort of like unconditional branches. + return true; +} + bool CallAnalyzer::visitInstruction(Instruction &I) { // Some instructions are free. All of the free intrinsics can also be // handled by SROA, etc. diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index f3fc69c86cd1e6e..4dfd99b60c74bac 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -30,6 +30,7 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -147,6 +148,13 @@ static cl::opt HoistRuntimeChecks( cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true)); bool VectorizerParams::HoistRuntimeChecks; +/// Enable analysis using Tapir based on the data-race-free assumption. +static cl::opt EnableDRFAA( + "enable-drf-laa", cl::Hidden, + cl::desc("Enable analysis using Tapir based on the data-race-free " + "assumption"), + cl::init(false)); + bool VectorizerParams::isInterleaveForced() { return ::VectorizationInterleave.getNumOccurrences() > 0; } @@ -1795,6 +1803,12 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { Status = S; } +/// Returns true if this loop is logically parallel as indicated by Tapir. 
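+/// This holds when the loop carries llvm.loop.fromtapirloop metadata or when
+/// TaskInfo identifies a Tapir-loop structure rooted at the loop.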
+static bool isLogicallyParallelViaTapir(const Loop *L, TaskInfo *TI) { + return L->wasDerivedFromTapirLoop() || + (TI && getTaskIfTapirLoopStructure(L, TI)); +} + /// Given a dependence-distance \p Dist between two /// memory accesses, that have strides in the same direction whose absolute /// value of the maximum stride is given in \p MaxStride, and that have the same @@ -1912,6 +1926,11 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( Type *ATy = getLoadStoreType(AInst); Type *BTy = getLoadStoreType(BInst); + // Under certain assumptions, Tapir can guarantee that there are no + // loop-carried dependencies. + if (EnableDRFAA && isLogicallyParallelViaTapir(InnermostLoop, TI)) + return Dependence::NoDep; + // We cannot check pointers in different address spaces. if (APtr->getType()->getPointerAddressSpace() != BPtr->getType()->getPointerAddressSpace()) @@ -2290,6 +2309,12 @@ bool MemoryDepChecker::areDepsSafe(const DepCandidates &AccessSets, Dependence::DepType Type = isDependent(*A.first, A.second, *B.first, B.second); + // Backward dependencies cannot happen in Tapir loops. + if ((Dependence::Backward == Type || + Dependence::BackwardVectorizable == Type || + Dependence::BackwardVectorizableButPreventsForwarding == Type) + && isLogicallyParallelViaTapir(InnermostLoop, TI)) + Type = Dependence::NoDep; mergeInStatus(Dependence::isSafeForVectorization(Type)); // Gather dependences unless we accumulated MaxDependences @@ -2390,7 +2415,7 @@ bool LoopAccessInfo::canAnalyzeLoop() { bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, const TargetLibraryInfo *TLI, - DominatorTree *DT) { + DominatorTree *DT, TaskInfo *TI) { // Holds the Load and Store instructions. SmallVector Loads; SmallVector Stores; @@ -2408,7 +2433,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, PtrRtChecking->Pointers.clear(); PtrRtChecking->Need = false; - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel() || + (EnableDRFAA && isLogicallyParallelViaTapir(TheLoop, TI)); const bool EnableMemAccessVersioningOfLoop = EnableMemAccessVersioning && @@ -2458,6 +2484,10 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, !VFDatabase::getMappings(*Call).empty()) continue; + // Ignore Tapir instructions. + if (isa(&I) || isa(&I) || isa(&I)) + continue; + auto *Ld = dyn_cast(&I); if (!Ld) { recordAnalysis("CantVectorizeInstruction", Ld) @@ -2482,6 +2512,11 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, // Save 'store' instructions. Abort if other instructions write to memory. if (I.mayWriteToMemory()) { + // TODO: Determine if we should do something other than ignore Tapir + // instructions here. 
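+      // For now, detach, reattach, and sync instructions are skipped here,
+      // mirroring the skip earlier in this loop.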
+ if (isa(&I) || isa(&I) || isa(&I)) + continue; + auto *St = dyn_cast(&I); if (!St) { recordAnalysis("CantVectorizeInstruction", St) @@ -2994,7 +3029,7 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI, const TargetLibraryInfo *TLI, AAResults *AA, - DominatorTree *DT, LoopInfo *LI) + DominatorTree *DT, LoopInfo *LI, TaskInfo *TI) : PSE(std::make_unique(*SE, *L)), PtrRtChecking(nullptr), TheLoop(L) { unsigned MaxTargetVectorWidthInBits = std::numeric_limits::max(); @@ -3016,7 +3051,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, MaxTargetVectorWidthInBits); PtrRtChecking = std::make_unique(*DepChecker, SE); if (canAnalyzeLoop()) - CanVecMem = analyzeLoop(AA, LI, TLI, DT); + CanVecMem = analyzeLoop(AA, LI, TLI, DT, TI); } void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { @@ -3072,7 +3107,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) { if (Inserted) It->second = - std::make_unique(&L, &SE, TTI, TLI, &AA, &DT, &LI); + std::make_unique(&L, &SE, TTI, TLI, &AA, &DT, &LI, &TI); return *It->second; } @@ -3108,7 +3143,8 @@ bool LoopAccessInfoManager::invalidate( return Inv.invalidate(F, PA) || Inv.invalidate(F, PA) || Inv.invalidate(F, PA) || - Inv.invalidate(F, PA); + Inv.invalidate(F, PA) || + Inv.invalidate(F, PA); } LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, @@ -3119,7 +3155,8 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, auto &LI = FAM.getResult(F); auto &TTI = FAM.getResult(F); auto &TLI = FAM.getResult(F); - return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI); + auto &TI = FAM.getResult(F); + return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &TI); } AnalysisKey LoopAccessAnalysis::Key; diff --git a/llvm/lib/Analysis/LoopAnalysisManager.cpp b/llvm/lib/Analysis/LoopAnalysisManager.cpp index 74c318ee5b975b3..ae561f35b198f17 100644 --- a/llvm/lib/Analysis/LoopAnalysisManager.cpp +++ b/llvm/lib/Analysis/LoopAnalysisManager.cpp @@ -11,6 +11,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PassManagerImpl.h" #include @@ -54,6 +55,7 @@ bool LoopAnalysisManagerFunctionProxy::Result::invalidate( Inv.invalidate(F, PA) || Inv.invalidate(F, PA) || Inv.invalidate(F, PA) || + Inv.invalidate(F, PA) || invalidateMemorySSAAnalysis) { // Note that the LoopInfo may be stale at this point, however the loop // objects themselves remain the only viable keys that could be in the @@ -141,5 +143,6 @@ PreservedAnalyses llvm::getLoopPassPreservedAnalyses() { PA.preserve(); PA.preserve(); PA.preserve(); + PA.preserve(); return PA; } diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 6bb5f001e9bd1d9..2918ddca7e69c8e 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -58,6 +58,236 @@ static cl::opt // Loop implementation // +// Returns true if the basic block Succ that succeeds BB is the unwind +// destination of a detach. +static bool succIsDetachUnwind(const BasicBlock *BB, const BasicBlock *Succ) { + if (const DetachInst *DI = dyn_cast(BB->getTerminator())) + return Succ == DI->getUnwindDest(); + return false; +} + +/// Returns true if the given instruction performs a taskframe resume, false +/// otherwise. 
+static bool isDetachedRethrow(const Instruction *I, + const Value *SyncReg = nullptr) { + if (const InvokeInst *II = dyn_cast(I)) + if (const Function *Called = II->getCalledFunction()) + if (Intrinsic::detached_rethrow == Called->getIntrinsicID()) + if (!SyncReg || (SyncReg == II->getArgOperand(0))) + return true; + return false; +} + +/// Returns true if the given instruction performs a taskframe resume, false +/// otherwise. +static bool isTaskFrameResume(const Instruction *I, + const Value *TaskFrame = nullptr) { + if (const InvokeInst *II = dyn_cast(I)) + if (const Function *Called = II->getCalledFunction()) + if (Intrinsic::taskframe_resume == Called->getIntrinsicID()) + if (!TaskFrame || (TaskFrame == II->getArgOperand(0))) + return true; + return false; +} + +/// Returns true if the given basic block is a placeholder successor of a +/// taskframe.resume or detached.rethrow. +static bool isTapirPlaceholderSuccessor(const BasicBlock *B) { + for (const BasicBlock *Pred : predecessors(B)) { + if (!isDetachedRethrow(Pred->getTerminator()) && + !isTaskFrameResume(Pred->getTerminator())) + return false; + + const InvokeInst *II = dyn_cast(Pred->getTerminator()); + if (B != II->getNormalDest()) + return false; + } + return true; +} + +/// Helper method to find loop-exit blocks that are contained within tasks +/// spawned within the loop. +static void getTaskExitsHelper(BasicBlock *TaskEntry, const Value *SyncRegion, + const Loop *L, + SmallPtrSetImpl &TaskExits) { + // Traverse the CFG to find the exit blocks from SubT. + SmallVector Worklist; + SmallPtrSet Visited; + Worklist.push_back(TaskEntry); + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Record any block found in the task that is not contained in the loop + if (!L->contains(BB)) + TaskExits.insert(BB); + + // Stop the CFG traversal at any reattach or detached.rethrow in the same + // sync region. + if (ReattachInst *RI = dyn_cast(BB->getTerminator())) + if (SyncRegion == RI->getSyncRegion()) + continue; + if (isDetachedRethrow(BB->getTerminator(), SyncRegion)) + continue; + + // For all other basic blocks, traverse all successors + for (BasicBlock *Succ : successors(BB)) + Worklist.push_back(Succ); + } +} + +/// getTaskExits - Get basic blocks that are outside of the loop, based on CFG +/// analysis, but inside tasks created within the loop. +/// +void Loop::getTaskExits(SmallPtrSetImpl &TaskExits) const { + SmallVector, 4> TaskEntriesToCheck; + for (auto *BB : blocks()) + if (DetachInst *DI = dyn_cast(BB->getTerminator())) + if (DI->hasUnwindDest()) + if (!contains(DI->getUnwindDest())) + TaskEntriesToCheck.push_back( + std::make_pair(DI->getDetached(), DI->getSyncRegion())); + + for (std::pair &TaskEntry : TaskEntriesToCheck) + getTaskExitsHelper(TaskEntry.first, TaskEntry.second, this, TaskExits); +} + +/// getExitingBlocks - Return all blocks inside the loop that have successors +/// outside of the loop. These are the blocks _inside of the current loop_ +/// which branch out. The returned list is always unique. +/// +void Loop::getExitingBlocks(SmallVectorImpl &ExitingBlocks, + bool IgnoreDetachUnwind) const { + assert(!isInvalid() && "Loop not in a valid state!"); + for (const auto BB : blocks()) + for (const auto *Succ : children(BB)) + if (!contains(Succ)) { + if (IgnoreDetachUnwind && succIsDetachUnwind(BB, Succ)) + continue; + // Not in current loop? It must be an exit block. 
+ ExitingBlocks.push_back(BB); + break; + } +} + +/// getExitingBlock - If getExitingBlocks would return exactly one block, +/// return that block. Otherwise return null. +BasicBlock *Loop::getExitingBlock(bool IgnoreDetachUnwind) const { + assert(!isInvalid() && "Loop not in a valid state!"); + SmallVector ExitingBlocks; + getExitingBlocks(ExitingBlocks, IgnoreDetachUnwind); + if (ExitingBlocks.size() == 1) + return ExitingBlocks[0]; + return nullptr; +} + +/// getExitBlocks - Return all of the successor blocks of this loop. These +/// are the blocks _outside of the current loop_ which are branched to. +/// +void Loop::getExitBlocks( + SmallVectorImpl &ExitBlocks) const { + assert(!isInvalid() && "Loop not in a valid state!"); + std::vector Blocks(block_begin(), block_end()); + SmallPtrSet TaskExits; + getTaskExits(TaskExits); + Blocks.insert(Blocks.end(), TaskExits.begin(), TaskExits.end()); + + for (const auto BB : Blocks) + for (auto *Succ : children(BB)) + if (!contains(Succ) && !TaskExits.count(Succ) && + !isTapirPlaceholderSuccessor(Succ)) + // Not in current loop? It must be an exit block. + ExitBlocks.push_back(Succ); +} + +/// getExitBlock - If getExitBlocks would return exactly one block, +/// return that block. Otherwise return null. +BasicBlock *Loop::getExitBlock() const { + assert(!isInvalid() && "Loop not in a valid state!"); + SmallVector ExitBlocks; + getExitBlocks(ExitBlocks); + if (ExitBlocks.size() == 1) + return ExitBlocks[0]; + return nullptr; +} + +bool Loop::hasDedicatedExits() const { + // Each predecessor of each exit block of a normal loop is contained + // within the loop. + SmallVector UniqueExitBlocks; + getUniqueExitBlocks(UniqueExitBlocks); + SmallPtrSet TaskExits; + getTaskExits(TaskExits); + + for (BasicBlock *EB : UniqueExitBlocks) + for (BasicBlock *Predecessor : children>(EB)) + if (!contains(Predecessor) && !TaskExits.count(Predecessor)) + return false; + // All the requirements are met. + return true; +} + +// Helper function to get unique loop exits. Pred is a predicate pointing to +// BasicBlocks in a loop which should be considered to find loop exits. 
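+// Blocks that are reachable only through tasks spawned in the loop are treated
+// as part of the loop, and Tapir placeholder successors (of detached.rethrow
+// or taskframe.resume) are not reported as exits.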
+template +void getUniqueExitBlocksOutsideTasksHelper( + const Loop *L, SmallVectorImpl &ExitBlocks, PredicateT Pred) { + assert(!L->isInvalid() && "Loop not in a valid state!"); + SmallPtrSet Visited; + std::vector Blocks(L->block_begin(), L->block_end()); + SmallPtrSet TaskExits; + L->getTaskExits(TaskExits); + Blocks.insert(Blocks.end(), TaskExits.begin(), TaskExits.end()); + + auto Filtered = make_filter_range(Blocks, Pred); + for (BasicBlock *BB : Filtered) { + for (BasicBlock *Successor : children(BB)) + if (!L->contains(Successor) && !TaskExits.count(Successor) && + !isTapirPlaceholderSuccessor(Successor)) + if (Visited.insert(Successor).second) + ExitBlocks.push_back(Successor); + } +} + +void Loop::getUniqueExitBlocks( + SmallVectorImpl &ExitBlocks) const { + getUniqueExitBlocksOutsideTasksHelper( + this, ExitBlocks, [](const BasicBlock *BB) { return true; }); +} + +void Loop::getUniqueNonLatchExitBlocks( + SmallVectorImpl &ExitBlocks) const { + const BasicBlock *Latch = getLoopLatch(); + assert(Latch && "Latch block must exists"); + getUniqueExitBlocksOutsideTasksHelper( + this, ExitBlocks, [Latch](const BasicBlock *BB) { return BB != Latch; }); +} + +BasicBlock *Loop::getUniqueExitBlock() const { + SmallVector UniqueExitBlocks; + getUniqueExitBlocks(UniqueExitBlocks); + if (UniqueExitBlocks.size() == 1) + return UniqueExitBlocks[0]; + return nullptr; +} + +/// getExitEdges - Return all pairs of (_inside_block_,_outside_block_). +void Loop::getExitEdges(SmallVectorImpl &ExitEdges) const { + assert(!isInvalid() && "Loop not in a valid state!"); + std::vector Blocks(block_begin(), block_end()); + SmallPtrSet TaskExits; + getTaskExits(TaskExits); + Blocks.insert(Blocks.end(), TaskExits.begin(), TaskExits.end()); + + for (const auto BB : Blocks) + for (auto *Succ : children(BB)) + if (!contains(Succ) && !TaskExits.count(Succ) && + !isTapirPlaceholderSuccessor(Succ)) + // Not in current loop? It must be an exit block. + ExitEdges.emplace_back(BB, Succ); +} + bool Loop::isLoopInvariant(const Value *V) const { if (const Instruction *I = dyn_cast(V)) return !contains(I); @@ -430,7 +660,9 @@ bool Loop::isCanonical(ScalarEvolution &SE) const { // Check that 'BB' doesn't have any uses outside of the 'L' static bool isBlockInLCSSAForm(const Loop &L, const BasicBlock &BB, - const DominatorTree &DT, bool IgnoreTokens) { + const DominatorTree &DT, + SmallPtrSetImpl &TaskExits, + bool IgnoreTokens) { for (const Instruction &I : BB) { // Tokens can't be used in PHI nodes and live-out tokens prevent loop // optimizations, so for the purposes of considered LCSSA form, we @@ -452,7 +684,7 @@ static bool isBlockInLCSSAForm(const Loop &L, const BasicBlock &BB, // the use is anywhere in the loop. Most values are used in the same // block they are defined in. Also, blocks not reachable from the // entry are special; uses in them don't need to go through PHIs. - if (UserBB != &BB && !L.contains(UserBB) && + if (UserBB != &BB && !L.contains(UserBB) && !TaskExits.count(UserBB) && DT.isReachableFromEntry(UserBB)) return false; } @@ -462,8 +694,10 @@ static bool isBlockInLCSSAForm(const Loop &L, const BasicBlock &BB, bool Loop::isLCSSAForm(const DominatorTree &DT, bool IgnoreTokens) const { // For each block we check that it doesn't have any uses outside of this loop. 
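+  // Uses in blocks that are reachable only through tasks spawned in the loop
+  // do not violate LCSSA form, so collect those task-exit blocks first.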
+ SmallPtrSet TaskExits; + getTaskExits(TaskExits); return all_of(this->blocks(), [&](const BasicBlock *BB) { - return isBlockInLCSSAForm(*this, *BB, DT, IgnoreTokens); + return isBlockInLCSSAForm(*this, *BB, DT, TaskExits, IgnoreTokens); }); } @@ -472,8 +706,11 @@ bool Loop::isRecursivelyLCSSAForm(const DominatorTree &DT, const LoopInfo &LI, // For each block we check that it doesn't have any uses outside of its // innermost loop. This process will transitively guarantee that the current // loop and all of the nested loops are in LCSSA form. + SmallPtrSet TaskExits; + getTaskExits(TaskExits); return all_of(this->blocks(), [&](const BasicBlock *BB) { - return isBlockInLCSSAForm(*LI.getLoopFor(BB), *BB, DT, IgnoreTokens); + return isBlockInLCSSAForm(*LI.getLoopFor(BB), *BB, DT, TaskExits, + IgnoreTokens); }); } @@ -562,6 +799,31 @@ void Loop::setLoopMustProgress() { setLoopID(NewLoopID); } +void Loop::setDerivedFromTapirLoop() { + LLVMContext &Context = getHeader()->getContext(); + + MDNode *FromTapir = findOptionMDForLoop(this, "llvm.loop.fromtapirloop"); + + if (FromTapir) + return; + + MDNode *FromTapirMD = + MDNode::get(Context, MDString::get(Context, "llvm.loop.fromtapirloop")); + MDNode *LoopID = getLoopID(); + MDNode *NewLoopID = + makePostTransformationMetadata(Context, LoopID, {}, {FromTapirMD}); + setLoopID(NewLoopID); +} + +bool Loop::wasDerivedFromTapirLoop() const { + MDNode *FromTapir = findOptionMDForLoop(this, "llvm.loop.fromtapirloop"); + + if (FromTapir) + return true; + + return false; +} + bool Loop::isAnnotatedParallel() const { MDNode *DesiredLoopIdMetadata = getLoopID(); diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index 1edc51e9ce5da3e..c26c14298267bef 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -375,6 +375,27 @@ static bool CheckedZextOrTrunc(APInt &I, unsigned IntTyBits) { return true; } +std::pair +llvm::getAllocSizeArgs(const CallBase *CB, const TargetLibraryInfo *TLI) { + // Note: This handles both explicitly listed allocation functions and + // allocsize. The code structure could stand to be cleaned up a bit. + const std::optional FnData = getAllocationSize(CB, TLI); + if (!FnData) + return std::make_pair(nullptr, nullptr); + + // Don't handle strdup-like functions. + if (FnData->AllocTy == StrDupLike) + return std::make_pair(nullptr, nullptr); + + if (FnData->SndParam < 0) + // Only have 1 size parameter. + return std::make_pair(CB->getArgOperand(FnData->FstParam), nullptr); + + // Have 2 size parameters. 
+ return std::make_pair(CB->getArgOperand(FnData->FstParam), + CB->getArgOperand(FnData->SndParam)); +} + std::optional llvm::getAllocSize(const CallBase *CB, const TargetLibraryInfo *TLI, function_ref Mapper) { diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 9f7baa983f1229b..4e8628408fee954 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/PHITransAddr.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" @@ -80,6 +81,11 @@ static cl::opt cl::desc("The number of blocks to scan during memory " "dependency analysis (default = 200)")); +static cl::opt + EnableDRF("enable-drf-memdep", cl::init(false), cl::Hidden, + cl::desc("Allow MemoryDependenceAnalysis to assume the program " + "is data-race free.")); + // Limit on the number of memdep results to process. static const unsigned int NumResultsLimit = 100; @@ -185,6 +191,11 @@ MemDepResult MemoryDependenceResults::getCallDependencyFrom( BasicBlock *BB) { unsigned Limit = getDefaultBlockScanLimit(); + if (EnableDRF && TI) + if ((TI->getTaskFor(BB) != TI->getTaskFor(Call->getParent())) + && TI->mayHappenInParallel(Call->getParent(), BB)) + return MemDepResult::getNonLocal(); + // Walk backwards through the block, looking for dependencies. while (ScanIt != BB->begin()) { Instruction *Inst = &*--ScanIt; @@ -241,6 +252,10 @@ MemDepResult MemoryDependenceResults::getPointerDependencyFrom( const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB, Instruction *QueryInst, unsigned *Limit, BatchAAResults &BatchAA) { + if (EnableDRF && TI && QueryInst) + if ((TI->getTaskFor(BB) != TI->getTaskFor(QueryInst->getParent())) + && TI->mayHappenInParallel(QueryInst->getParent(), BB)) + return MemDepResult::getNonLocal(); MemDepResult InvariantGroupDependency = MemDepResult::getUnknown(); if (QueryInst != nullptr) { if (auto *LI = dyn_cast(QueryInst)) { @@ -1764,7 +1779,8 @@ MemoryDependenceAnalysis::run(Function &F, FunctionAnalysisManager &AM) { auto &AC = AM.getResult(F); auto &TLI = AM.getResult(F); auto &DT = AM.getResult(F); - return MemoryDependenceResults(AA, AC, TLI, DT, DefaultBlockScanLimit); + auto *TI = EnableDRF ? 
&AM.getResult(F) : nullptr; + return MemoryDependenceResults(AA, AC, TLI, DT, DefaultBlockScanLimit, TI); } char MemoryDependenceWrapperPass::ID = 0; @@ -1775,6 +1791,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) INITIALIZE_PASS_END(MemoryDependenceWrapperPass, "memdep", "Memory Dependence Analysis", false, true) @@ -1792,6 +1809,8 @@ void MemoryDependenceWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired(); AU.addRequired(); + if (EnableDRF) + AU.addRequired(); AU.addRequiredTransitive(); AU.addRequiredTransitive(); } @@ -1823,6 +1842,8 @@ bool MemoryDependenceWrapperPass::runOnFunction(Function &F) { auto &AC = getAnalysis().getAssumptionCache(F); auto &TLI = getAnalysis().getTLI(F); auto &DT = getAnalysis().getDomTree(); - MemDep.emplace(AA, AC, TLI, DT, BlockScanLimit); + auto *TI = + EnableDRF ? &getAnalysis().getTaskInfo() : nullptr; + MemDep.emplace(AA, AC, TLI, DT, BlockScanLimit, TI); return false; } diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index 48ef73e59045e74..b22d938fd4c4305 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -27,6 +27,7 @@ #include "llvm/Analysis/IteratedDominanceFrontier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/BasicBlock.h" @@ -69,6 +70,7 @@ INITIALIZE_PASS_BEGIN(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) INITIALIZE_PASS_END(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false, true) @@ -88,6 +90,15 @@ static cl::opt VerifyMemorySSAX("verify-memoryssa", cl::location(VerifyMemorySSA), cl::Hidden, cl::desc("Enable verification of MemorySSA.")); +static cl::opt + EnableDRF("enable-drf-memoryssa", cl::init(false), cl::Hidden, + cl::desc("Allow MemorySSA to assume the program is " + "data-race free.")); + +static cl::opt RequireTI("require-taskinfo-memoryssa", cl::init(true), + cl::Hidden, + cl::desc("Require TaskInfo for MemorySSA.")); + const static char LiveOnEntryStr[] = "liveOnEntry"; namespace { @@ -173,7 +184,7 @@ class MemoryLocOrCall { IsCall = false; // There is no such thing as a memorylocation for a fence inst, and it is // unique in that regard. - if (!isa(Inst)) + if (!isa(Inst) && !isa(Inst)) Loc = MemoryLocation::get(Inst); } } @@ -280,10 +291,25 @@ static bool areLoadsReorderable(const LoadInst *Use, template static bool instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc, - const Instruction *UseInst, AliasAnalysisType &AA) { + const Instruction *UseInst, AliasAnalysisType &AA, + TaskInfo *TI) { Instruction *DefInst = MD->getMemoryInst(); assert(DefInst && "Defining instruction not actually an instruction"); + if (TI && EnableDRF) + if ((TI->getTaskFor(MD->getBlock()) != + TI->getTaskFor(UseInst->getParent())) && + TI->mayHappenInParallel(MD->getBlock(), UseInst->getParent())) + return false; + + // Check for invokes of detached.rethrow, taskframe.resume, or sync.unwind. 
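+  // Invokes of these intrinsics are control-flow markers for Tapir constructs
+  // and are never treated as clobbers.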
+ if (const InvokeInst *II = dyn_cast(DefInst)) + if (const Function *Called = II->getCalledFunction()) + if (Intrinsic::detached_rethrow == Called->getIntrinsicID() || + Intrinsic::taskframe_resume == Called->getIntrinsicID() || + Intrinsic::sync_unwind == Called->getIntrinsicID()) + return false; + if (const IntrinsicInst *II = dyn_cast(DefInst)) { // These intrinsics will show up as affecting memory, but they are just // markers, mostly. @@ -297,6 +323,12 @@ instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc, case Intrinsic::allow_ubsan_check: case Intrinsic::invariant_start: case Intrinsic::invariant_end: + case Intrinsic::syncregion_start: + case Intrinsic::taskframe_create: + case Intrinsic::taskframe_use: + case Intrinsic::taskframe_end: + case Intrinsic::taskframe_load_guard: + case Intrinsic::sync_unwind: case Intrinsic::assume: case Intrinsic::experimental_noalias_scope_decl: case Intrinsic::pseudoprobe: @@ -311,7 +343,8 @@ instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc, } if (auto *CB = dyn_cast_or_null(UseInst)) { - ModRefInfo I = AA.getModRefInfo(DefInst, CB); + bool SameSpindle = false; + ModRefInfo I = AA.getModRefInfo(DefInst, CB, SameSpindle); return isModOrRefSet(I); } @@ -326,20 +359,20 @@ instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc, template static bool instructionClobbersQuery(MemoryDef *MD, const MemoryUseOrDef *MU, const MemoryLocOrCall &UseMLOC, - AliasAnalysisType &AA) { + AliasAnalysisType &AA, TaskInfo *TI) { // FIXME: This is a temporary hack to allow a single instructionClobbersQuery // to exist while MemoryLocOrCall is pushed through places. if (UseMLOC.IsCall) return instructionClobbersQuery(MD, MemoryLocation(), MU->getMemoryInst(), - AA); + AA, TI); return instructionClobbersQuery(MD, UseMLOC.getLoc(), MU->getMemoryInst(), - AA); + AA, TI); } // Return true when MD may alias MU, return false otherwise. bool MemorySSAUtil::defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU, - AliasAnalysis &AA) { - return instructionClobbersQuery(MD, MU, MemoryLocOrCall(MU), AA); + AliasAnalysis &AA, TaskInfo *TI) { + return instructionClobbersQuery(MD, MU, MemoryLocOrCall(MU), AA, TI); } namespace { @@ -397,7 +430,7 @@ LLVM_ATTRIBUTE_UNUSED static void checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt, const MemoryLocation &StartLoc, const MemorySSA &MSSA, const UpwardsMemoryQuery &Query, BatchAAResults &AA, - bool AllowImpreciseClobber = false) { + TaskInfo *TI = nullptr, bool AllowImpreciseClobber = false) { assert(MSSA.dominates(ClobberAt, Start) && "Clobber doesn't dominate start?"); if (MSSA.isLiveOnEntryDef(Start)) { @@ -429,7 +462,7 @@ checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt, // since MD may only act as a clobber for 1 of N MemoryLocations. 
FoundClobber = FoundClobber || MSSA.isLiveOnEntryDef(MD); if (!FoundClobber) { - if (instructionClobbersQuery(MD, MAP.second, Query.Inst, AA)) + if (instructionClobbersQuery(MD, MAP.second, Query.Inst, AA, TI)) FoundClobber = true; } } @@ -444,7 +477,7 @@ checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt, if (MD == Start) continue; - assert(!instructionClobbersQuery(MD, MAP.second, Query.Inst, AA) && + assert(!instructionClobbersQuery(MD, MAP.second, Query.Inst, AA, TI) && "Found clobber before reaching ClobberAt!"); continue; } @@ -513,6 +546,7 @@ class ClobberWalker { const MemorySSA &MSSA; DominatorTree &DT; BatchAAResults *AA; + TaskInfo *TI; UpwardsMemoryQuery *Query; unsigned *UpwardWalkLimit; @@ -578,7 +612,7 @@ class ClobberWalker { if (!--*UpwardWalkLimit) return {Current, true}; - if (instructionClobbersQuery(MD, Desc.Loc, Query->Inst, *AA)) + if (instructionClobbersQuery(MD, Desc.Loc, Query->Inst, *AA, TI)) return {MD, true}; } } @@ -922,8 +956,8 @@ class ClobberWalker { } public: - ClobberWalker(const MemorySSA &MSSA, DominatorTree &DT) - : MSSA(MSSA), DT(DT) {} + ClobberWalker(const MemorySSA &MSSA, DominatorTree &DT, TaskInfo *TI) + : MSSA(MSSA), DT(DT), TI(TI) {} /// Finds the nearest clobber for the given query, optimizing phis if /// possible. @@ -959,7 +993,7 @@ class ClobberWalker { #ifdef EXPENSIVE_CHECKS if (!Q.SkipSelfAccess && *UpwardWalkLimit > 0) - checkClobberSanity(Current, Result, Q.StartingLoc, MSSA, Q, BAA); + checkClobberSanity(Current, Result, Q.StartingLoc, MSSA, Q, BAA, TI); #endif return Result; } @@ -990,7 +1024,8 @@ class MemorySSA::ClobberWalkerBase { MemorySSA *MSSA; public: - ClobberWalkerBase(MemorySSA *M, DominatorTree *D) : Walker(*M, *D), MSSA(M) {} + ClobberWalkerBase(MemorySSA *M, DominatorTree *D, TaskInfo *TI) + : Walker(*M, *D, TI), MSSA(M) {} MemoryAccess *getClobberingMemoryAccessBase(MemoryAccess *, const MemoryLocation &, @@ -1230,8 +1265,9 @@ void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) { } } -MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT) - : DT(DT), F(&Func), LiveOnEntryDef(nullptr), Walker(nullptr), +MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT, + TaskInfo*TI) + : DT(DT), F(&Func), TI(TI), LiveOnEntryDef(nullptr), Walker(nullptr), SkipWalker(nullptr) { // Build MemorySSA using a batch alias analysis. This reuses the internal // state that AA collects during an alias()/getModRefInfo() call. 
This is @@ -1304,8 +1340,8 @@ namespace llvm { class MemorySSA::OptimizeUses { public: OptimizeUses(MemorySSA *MSSA, CachingWalker *Walker, BatchAAResults *BAA, - DominatorTree *DT) - : MSSA(MSSA), Walker(Walker), AA(BAA), DT(DT) {} + DominatorTree *DT, TaskInfo *TI) + : MSSA(MSSA), Walker(Walker), AA(BAA), DT(DT), TI(TI) {} void optimizeUses(); @@ -1335,6 +1371,7 @@ class MemorySSA::OptimizeUses { CachingWalker *Walker; BatchAAResults *AA; DominatorTree *DT; + TaskInfo *TI; }; } // end namespace llvm @@ -1466,7 +1503,7 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock( } MemoryDef *MD = cast(VersionStack[UpperBound]); - if (instructionClobbersQuery(MD, MU, UseMLOC, *AA)) { + if (instructionClobbersQuery(MD, MU, UseMLOC, *AA, TI)) { FoundClobberResult = true; break; } @@ -1594,7 +1631,7 @@ MemorySSA::CachingWalker *MemorySSA::getWalkerImpl() { return Walker.get(); if (!WalkerBase) - WalkerBase = std::make_unique(this, DT); + WalkerBase = std::make_unique(this, DT, TI); Walker = std::make_unique(this, WalkerBase.get()); return Walker.get(); @@ -1605,7 +1642,7 @@ MemorySSAWalker *MemorySSA::getSkipSelfWalker() { return SkipWalker.get(); if (!WalkerBase) - WalkerBase = std::make_unique(this, DT); + WalkerBase = std::make_unique(this, DT, TI); SkipWalker = std::make_unique(this, WalkerBase.get()); return SkipWalker.get(); @@ -1773,6 +1810,7 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I, case Intrinsic::assume: case Intrinsic::experimental_noalias_scope_decl: case Intrinsic::pseudoprobe: + case Intrinsic::syncregion_start: return nullptr; } } @@ -1783,6 +1821,10 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I, if (!I->mayReadFromMemory() && !I->mayWriteToMemory()) return nullptr; + // Ignore detach instructions. + if (isa(I)) + return nullptr; + bool Def, Use; if (Template) { Def = isa(Template); @@ -2202,9 +2244,9 @@ void MemorySSA::ensureOptimizedUses() { return; BatchAAResults BatchAA(*AA); - ClobberWalkerBase WalkerBase(this, DT); + ClobberWalkerBase WalkerBase(this, DT, TI); CachingWalker WalkerLocal(this, &WalkerBase); - OptimizeUses(this, &WalkerLocal, &BatchAA, DT).optimizeUses(); + OptimizeUses(this, &WalkerLocal, &BatchAA, DT, TI).optimizeUses(); IsOptimized = true; } @@ -2367,7 +2409,9 @@ MemorySSAAnalysis::Result MemorySSAAnalysis::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult(F); auto &AA = AM.getResult(F); - return MemorySSAAnalysis::Result(std::make_unique(F, &AA, &DT)); + TaskInfo *TI = &AM.getResult(F); + return MemorySSAAnalysis::Result( + std::make_unique(F, &AA, &DT, TI)); } bool MemorySSAAnalysis::Result::invalidate( @@ -2376,7 +2420,8 @@ bool MemorySSAAnalysis::Result::invalidate( auto PAC = PA.getChecker(); return !(PAC.preserved() || PAC.preservedSet>()) || Inv.invalidate(F, PA) || - Inv.invalidate(F, PA); + Inv.invalidate(F, PA) || + Inv.invalidate(F, PA); } PreservedAnalyses MemorySSAPrinterPass::run(Function &F, @@ -2424,12 +2469,17 @@ void MemorySSAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequiredTransitive(); AU.addRequiredTransitive(); + // TODO: Add TaskInfoWrapperPass to lib/Analysis/LoopPass.cpp to make this + // work? 
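+  // TaskInfo lets the clobber walker consult may-happen-in-parallel results
+  // when -enable-drf-memoryssa is enabled.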
+ if (RequireTI || EnableDRF) + AU.addRequiredTransitive(); } bool MemorySSAWrapperPass::runOnFunction(Function &F) { auto &DT = getAnalysis().getDomTree(); auto &AA = getAnalysis().getAAResults(); - MSSA.reset(new MemorySSA(F, &AA, &DT)); + auto &TI = getAnalysis().getTaskInfo(); + MSSA.reset(new MemorySSA(F, &AA, &DT, &TI)); return false; } diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp index 904d30d0544654d..83cb19f87c97cde 100644 --- a/llvm/lib/Analysis/MustExecute.cpp +++ b/llvm/lib/Analysis/MustExecute.cpp @@ -14,6 +14,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/Dominators.h" @@ -253,10 +254,33 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop, return true; } +// Helper function to check if an instruction is guaranteed to execute in the +// task T containing it. +static bool isGuaranteedToExecuteInTask(const Instruction &Inst, + const DominatorTree *DT, + const Task *T) { + assert(T && T->encloses(Inst.getParent()) && "Inst is not in given task."); + // Examine all exiting blocks of the task. + for (const Spindle *S : + depth_first>(T->getEntrySpindle())) { + for (const BasicBlock *Exit : S->spindle_exits()) { + if (!T->isTaskExiting(Exit)) + continue; + + // If Inst does not dominate the exiting block, then it's not guaranteed + // to execute. + if (!DT->dominates(Inst.getParent(), Exit)) + return false; + } + } + return true; +} + /// Returns true if the instruction in a loop is guaranteed to execute at least /// once. bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, + const TaskInfo *TI, const Loop *CurLoop) const { // If the instruction is in the header block for the loop (which is very // common), it is always guaranteed to dominate the exit blocks. Since this @@ -269,16 +293,75 @@ bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, return !HeaderMayThrow || Inst.getParent()->getFirstNonPHIOrDbg() == &Inst; + // If the instruction is inside of a subtask, verify that it dominates the + // exits of the subtask, and use the corresponding detach to determine whether + // the instruction is guaranteed to execute. + bool InstGuaranteedToExecuteInSubtask = true; + const Instruction *RepInst = &Inst; + if (TI) { + const Task *LoopTask = TI->getTaskFor(CurLoop->getHeader()); + while (InstGuaranteedToExecuteInSubtask) { + const Task *T = TI->getTaskFor(RepInst->getParent()); + // If the representative instruction and loop are in the same task, we're + // done traversing subtasks. + if (T == LoopTask) + break; + + // Check if the instruction is guaranteed to execute in its task. + if (!isGuaranteedToExecuteInTask(*RepInst, DT, T)) + InstGuaranteedToExecuteInSubtask = false; + else + // Use the task's detach in place of the original instruction. + RepInst = T->getDetach(); + } + } + + // If a subtask was found in which the instruction is not guaranteed to + // execute, then the instruction is not guaranteed to execute. + if (!InstGuaranteedToExecuteInSubtask) + return false; + // If there is a path from header to exit or latch that doesn't lead to our // instruction's block, return false. 
- return allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT); + return allLoopPathsLeadToBlock(CurLoop, RepInst->getParent(), DT); } bool ICFLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, + const TaskInfo *TI, const Loop *CurLoop) const { - return !ICF.isDominatedByICFIFromSameBlock(&Inst) && - allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT); + if (ICF.isDominatedByICFIFromSameBlock(&Inst)) + return false; + + // If the instruction is inside of a subtask, verify that it dominates the + // exits of the subtask, and use the corresponding detach to determine whether + // the instruction is guaranteed to execute. + bool InstGuaranteedToExecuteInSubtask = true; + const Instruction *RepInst = &Inst; + if (TI) { + const Task *LoopTask = TI->getTaskFor(CurLoop->getHeader()); + while (InstGuaranteedToExecuteInSubtask) { + const Task *T = TI->getTaskFor(RepInst->getParent()); + // If the representative instruction and loop are in the same task, we're + // done traversing subtasks. + if (T == LoopTask) + break; + + // Check if the instruction is guaranteed to execute in its task. + if (!isGuaranteedToExecuteInTask(*RepInst, DT, T)) + InstGuaranteedToExecuteInSubtask = false; + else + // Use the task's detach in place of the original instruction. + RepInst = T->getDetach(); + } + } + + // If a subtask was found in which the instruction is not guaranteed to + // execute, then the instruction is not guaranteed to execute. + if (!InstGuaranteedToExecuteInSubtask) + return false; + + return allLoopPathsLeadToBlock(CurLoop, RepInst->getParent(), DT); } bool ICFLoopSafetyInfo::doesNotWriteMemoryBefore(const BasicBlock *BB, @@ -309,13 +392,14 @@ bool ICFLoopSafetyInfo::doesNotWriteMemoryBefore(const Instruction &I, doesNotWriteMemoryBefore(BB, CurLoop); } -static bool isMustExecuteIn(const Instruction &I, Loop *L, DominatorTree *DT) { +static bool isMustExecuteIn(const Instruction &I, Loop *L, DominatorTree *DT, + TaskInfo *TI) { // TODO: merge these two routines. For the moment, we display the best // result obtained by *either* implementation. This is a bit unfair since no // caller actually gets the full power at the moment. 
SimpleLoopSafetyInfo LSI; LSI.computeLoopSafetyInfo(L); - return LSI.isGuaranteedToExecute(I, DT, L) || + return LSI.isGuaranteedToExecute(I, DT, TI, L) || isGuaranteedToExecuteForEveryIteration(&I, L); } @@ -327,11 +411,11 @@ class MustExecuteAnnotatedWriter : public AssemblyAnnotationWriter { public: MustExecuteAnnotatedWriter(const Function &F, - DominatorTree &DT, LoopInfo &LI) { + DominatorTree &DT, LoopInfo &LI, TaskInfo &TI) { for (const auto &I: instructions(F)) { Loop *L = LI.getLoopFor(I.getParent()); while (L) { - if (isMustExecuteIn(I, L, &DT)) { + if (isMustExecuteIn(I, L, &DT, &TI)) { MustExec[&I].push_back(L); } L = L->getParentLoop(); @@ -339,12 +423,12 @@ class MustExecuteAnnotatedWriter : public AssemblyAnnotationWriter { } } MustExecuteAnnotatedWriter(const Module &M, - DominatorTree &DT, LoopInfo &LI) { + DominatorTree &DT, LoopInfo &LI, TaskInfo &TI) { for (const auto &F : M) for (const auto &I: instructions(F)) { Loop *L = LI.getLoopFor(I.getParent()); while (L) { - if (isMustExecuteIn(I, L, &DT)) { + if (isMustExecuteIn(I, L, &DT, &TI)) { MustExec[&I].push_back(L); } L = L->getParentLoop(); @@ -742,8 +826,9 @@ PreservedAnalyses MustExecutePrinterPass::run(Function &F, FunctionAnalysisManager &AM) { auto &LI = AM.getResult(F); auto &DT = AM.getResult(F); + auto &TI = AM.getResult(F); - MustExecuteAnnotatedWriter Writer(F, DT, LI); + MustExecuteAnnotatedWriter Writer(F, DT, LI, TI); F.print(OS, &Writer); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Analysis/TapirRaceDetect.cpp b/llvm/lib/Analysis/TapirRaceDetect.cpp new file mode 100644 index 000000000000000..a3f6e63ee1f378f --- /dev/null +++ b/llvm/lib/Analysis/TapirRaceDetect.cpp @@ -0,0 +1,2206 @@ +//===- TapirRaceDetect.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// TapirRaceDetect is an LLVM pass that analyses Tapir tasks and dependences +// between memory accesses to find accesses that might race. 
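+// The analysis combines TaskInfo's may-happen-in-parallel information with
+// DependenceAnalysis to identify pairs of accesses that can race.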
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/TapirRaceDetect.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SpecialCaseList.h" +#include "llvm/Support/VirtualFileSystem.h" + +using namespace llvm; + +#define DEBUG_TYPE "tapir-race-detect" + +static cl::opt + AssumeSafeMalloc( + "assume-safe-malloc", cl::init(true), cl::Hidden, + cl::desc("Assume that calls to allocation functions are safe.")); + +static cl::opt + IgnoreTerminationCalls( + "ignore-termination-calls", cl::init(true), cl::Hidden, + cl::desc("Ignore calls in program-terminating exit blocks.")); + +static cl::opt + MaxUsesToExploreCapture( + "max-uses-to-explore-capture", cl::init(unsigned(-1)), cl::Hidden, + cl::desc("Maximum number of uses to explore for a capture query.")); + +static cl::list ClABIListFiles( + "strat-ignorelist", + cl::desc("File listing native ABI functions and how the pass treats them"), + cl::Hidden); + +// Boilerplate for legacy and new pass managers + +TapirRaceDetect::Result +TapirRaceDetect::run(Function &F, FunctionAnalysisManager &FAM) { + auto &DT = FAM.getResult(F); + auto &LI = FAM.getResult(F); + auto &TI = FAM.getResult(F); + auto &DI = FAM.getResult(F); + auto &SE = FAM.getResult(F); + auto *TLI = &FAM.getResult(F); + return RaceInfo(&F, DT, LI, TI, DI, SE, TLI); +} + +AnalysisKey TapirRaceDetect::Key; + +INITIALIZE_PASS_BEGIN(TapirRaceDetectWrapperPass, "tapir-race-detect", + "Tapir Race Detection", true, true) +INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) +INITIALIZE_PASS_END(TapirRaceDetectWrapperPass, "tapir-race-detect", + "Tapir Race Detection", true, true) + +char TapirRaceDetectWrapperPass::ID = 0; + +TapirRaceDetectWrapperPass::TapirRaceDetectWrapperPass() : FunctionPass(ID) { + initializeTapirRaceDetectWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool TapirRaceDetectWrapperPass::runOnFunction(Function &F) { + auto &DT = getAnalysis().getDomTree(); + auto &LI = getAnalysis().getLoopInfo(); + auto &TI = getAnalysis().getTaskInfo(); + auto &DI = getAnalysis().getDI(); + auto &SE = getAnalysis().getSE(); + auto *TLI = &getAnalysis().getTLI(F); + Info.reset(new RaceInfo(&F, DT, LI, TI, DI, SE, TLI)); + return false; +} + +RaceInfo &TapirRaceDetectWrapperPass::getRaceInfo() const { return *Info; } + 
+void TapirRaceDetectWrapperPass::releaseMemory() { Info.reset(); } + +void TapirRaceDetectWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequiredTransitive(); + AU.addRequiredTransitive(); + AU.addRequiredTransitive(); + AU.addRequiredTransitive(); + AU.addRequired(); + AU.addRequiredTransitive(); +} + +FunctionPass *llvm::createTapirRaceDetectWrapperPass() { + return new TapirRaceDetectWrapperPass(); +} + +void TapirRaceDetectWrapperPass::print(raw_ostream &OS, + const Module *) const { + Info->print(OS); +} + +PreservedAnalyses +TapirRaceDetectPrinterPass::run(Function &F, FunctionAnalysisManager &FAM) { + OS << "'Tapir race detection' for function '" << F.getName() << "':\n"; + FAM.getResult(F).print(OS); + return PreservedAnalyses::all(); +} + +bool RaceInfo::invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &Inv) { + // Check whether the analysis, all analyses on functions, or the function's + // CFG have been preserved. + auto PAC = PA.getChecker(); + return !(PAC.preserved() || PAC.preservedSet>() || + Inv.invalidate(F, PA) || + Inv.invalidate(F, PA) || + Inv.invalidate(F, PA) || + Inv.invalidate(F, PA) || + Inv.invalidate(F, PA) || + Inv.invalidate(F, PA)); +} + +// Copied from DataFlowSanitizer.cpp +static StringRef GetGlobalTypeString(const GlobalValue &G) { + // Types of GlobalVariables are always pointer types. + Type *GType = G.getValueType(); + // For now we support ignoring struct types only. + if (StructType *SGType = dyn_cast(GType)) { + if (!SGType->isLiteral()) + return SGType->getName(); + } + return ""; +} + +namespace { + +// Copied and adapted from DataFlowSanitizer.cpp +class StratABIList { + std::unique_ptr SCL; + + public: + StratABIList() = default; + + void set(std::unique_ptr List) { SCL = std::move(List); } + + /// Returns whether either this function or its source file are listed in the + /// given category. + bool isIn(const Function &F, StringRef Category = StringRef()) const { + return isIn(*F.getParent(), Category) || + SCL->inSection("cilk", "fun", F.getName(), Category); + } + + /// Returns whether this type is listed in the given category. + bool isIn(const Type &Ty, StringRef Category = StringRef()) const { + const Type *ElTy = &Ty; + // We only handle struct types right now. + if (const StructType *STy = dyn_cast(ElTy)) + if (STy->hasName()) + return SCL->inSection("cilk", "type", STy->getName(), Category); + return false; + } + + bool isIn(const GlobalVariable &GV, StringRef Category = StringRef()) const { + return isIn(*GV.getParent(), Category) || + SCL->inSection("cilk", "global", GV.getName(), Category); + } + + /// Returns whether this global alias is listed in the given category. + /// + /// If GA aliases a function, the alias's name is matched as a function name + /// would be. Similarly, aliases of globals are matched like globals. + bool isIn(const GlobalAlias &GA, StringRef Category = StringRef()) const { + if (isIn(*GA.getParent(), Category)) + return true; + + if (isa(GA.getValueType())) + return SCL->inSection("cilk", "fun", GA.getName(), Category); + + return SCL->inSection("cilk", "global", GA.getName(), Category) || + SCL->inSection("cilk", "type", GetGlobalTypeString(GA), + Category); + } + + /// Returns whether this module is listed in the given category. 
+ bool isIn(const Module &M, StringRef Category = StringRef()) const { + return SCL->inSection("cilk", "src", M.getModuleIdentifier(), Category); + } +}; + +// Structure to record the set of child tasks that might be in parallel with +// this spindle, ignoring back edges of loops. +// +// TODO: Improve this analysis to track the loop back edges responsible for +// specific maybe-parallel tasks. Use these back-edge tags to refine the +// dependence-analysis component of static race detection. Possible test case: +// intel/BlackScholes. +struct MaybeParallelTasksInLoopBody : public MaybeParallelTasks { + MPTaskListTy TaskList; + LoopInfo &LI; + + MaybeParallelTasksInLoopBody(LoopInfo &LI) : LI(LI) {} + + // This method performs the data-flow update computation on a given spindle. + bool evaluate(const Spindle *S, unsigned EvalNum) { + LLVM_DEBUG(dbgs() << "MPTInLoop::evaluate @ " << S->getEntry()->getName() + << "\n"); + if (!TaskList.count(S)) + TaskList.try_emplace(S); + + bool Complete = true; + for (const Spindle::SpindleEdge &PredEdge : S->in_edges()) { + const Spindle *Pred = PredEdge.first; + const BasicBlock *Inc = PredEdge.second; + + // If the incoming edge is a sync edge, get the associated sync region. + const Value *SyncRegSynced = nullptr; + if (const SyncInst *SI = dyn_cast(Inc->getTerminator())) + SyncRegSynced = SI->getSyncRegion(); + + // Skip back edges for this task list. + if (Loop *L = LI.getLoopFor(S->getEntry())) + if ((L->getHeader() == S->getEntry()) && L->contains(Inc)) + continue; + + // Iterate through the tasks in the task list for Pred. + for (const Task *MP : TaskList[Pred]) { + // Filter out any tasks that are synced by the sync region. + if (const DetachInst *DI = MP->getDetach()) + if (SyncRegSynced == DI->getSyncRegion()) + continue; + // Insert the task into this spindle's task list. If this task is a new + // addition, then we haven't yet reached the fixed point of this + // analysis. + if (TaskList[S].insert(MP).second) + Complete = false; + } + } + LLVM_DEBUG({ + dbgs() << " New MPT list for " << S->getEntry()->getName() + << (Complete ? " (complete)\n" : " (not complete)\n"); + for (const Task *MP : TaskList[S]) + dbgs() << " " << MP->getEntry()->getName() << "\n"; + }); + return Complete; + } +}; + +class AccessPtrAnalysis { +public: + /// Read or write access location. 
+ // using MemAccessInfo = PointerIntPair; + using MemAccessInfo = RaceInfo::MemAccessInfo; + // using MemAccessInfoList = SmallVector; + // using AccessToUnderlyingObjMap = + // DenseMap>; + using AccessToUnderlyingObjMap = RaceInfo::AccessToUnderlyingObjMap; + + AccessPtrAnalysis(DominatorTree &DT, TaskInfo &TI, LoopInfo &LI, + DependenceInfo &DI, ScalarEvolution &SE, + const TargetLibraryInfo *TLI, + AccessToUnderlyingObjMap &AccessToObjs) + : DT(DT), TI(TI), LI(LI), DI(DI), AA(DI.getAA()), SE(SE), TLI(TLI), + AccessToObjs(AccessToObjs), MPTasksInLoop(LI) { + TI.evaluateParallelState(MPTasks); + + std::vector AllABIListFiles; + AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), + ClABIListFiles.end()); + ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles, + *vfs::getRealFileSystem())); + } + + void addFunctionArgument(Value *Arg); + void addAccess(Instruction *I); + + void processAccessPtrs(RaceInfo::ResultTy &Result, + RaceInfo::ObjectMRTy &ObjectMRForRace, + RaceInfo::PtrChecksTy &AllPtrRtChecks); + +private: + using PtrAccessSet = SetVector; + + void checkForRacesHelper(const Task *T, RaceInfo::ResultTy &Result, + RaceInfo::ObjectMRTy &ObjectMRForRace); + bool checkOpaqueAccesses(GeneralAccess &GA1, GeneralAccess &GA2); + void evaluateMaybeParallelAccesses(GeneralAccess &GA1, GeneralAccess &GA2, + RaceInfo::ResultTy &Result, + RaceInfo::ObjectMRTy &ObjectMRForRace); + bool checkDependence(std::unique_ptr D, GeneralAccess &GA1, + GeneralAccess &GA2); + // void getRTPtrChecks(Loop *L, RaceInfo::ResultTy &Result, + // RaceInfo::PtrChecksTy &AllPtrRtChecks); + + bool PointerCapturedBefore(const Value *Ptr, const Instruction *I, + unsigned MaxUsesToExplore) const; + + AliasResult underlyingObjectsAlias(const GeneralAccess &GAA, + const GeneralAccess &GAB); + + void recordLocalRace(const GeneralAccess &GA, RaceInfo::ResultTy &Result, + RaceInfo::ObjectMRTy &ObjectMRForRace, + const GeneralAccess &Competitor); + DominatorTree &DT; + TaskInfo &TI; + LoopInfo &LI; + DependenceInfo &DI; + AliasAnalysis *AA; + ScalarEvolution &SE; + + const TargetLibraryInfo *TLI; + SmallPtrSet ArgumentPtrs; + AccessToUnderlyingObjMap &AccessToObjs; + + MaybeParallelTasks MPTasks; + MaybeParallelTasksInLoopBody MPTasksInLoop; + + // A mapping of tasks to instructions in that task that might participate in a + // determinacy race. + using TaskAccessMapTy = DenseMap>; + TaskAccessMapTy TaskAccessMap; + + // A mapping of spindles to instructions in that spindle that might + // participate in a determinacy race. + using SpindleAccessMapTy = + DenseMap>; + SpindleAccessMapTy SpindleAccessMap; + + // A mapping of loops to instructions in that loop that might + // participate in a determinacy race. + using LoopAccessMapTy = DenseMap>; + LoopAccessMapTy LoopAccessMap; + + mutable DenseMap, bool> + MayBeCapturedCache; + + // /// We need to check that all of the pointers in this list are disjoint + // /// at runtime. Using std::unique_ptr to make using move ctor simpler. + // DenseMap AllPtrRtChecking; + + // ABI list to ignore. + StratABIList ABIList; +}; + +} // end anonymous namespace + +static bool isFreeFn(const Instruction *I, const TargetLibraryInfo *TLI) { + if (!isa(I)) + return false; + const CallBase *CB = dyn_cast(I); + + if (!TLI) + return false; + + if (getFreedOperand(CB, TLI)) + return true; + + // Ideally we would just use getFreedOperand to determine whether I is a call + // to a libfree funtion. 
But if -fno-builtin is used, then getFreedOperand + // won't recognize any libfree functions. For instrumentation purposes, + // it's sufficient to recognize the function name. + const StringRef FreeFnNames[] = { + "_ZdlPv", + "_ZdaPv", + "_ZdlPvj", + "_ZdlPvm", + "_ZdlPvRKSt9nothrow_t", + "_ZdlPvSt11align_val_t", + "_ZdaPvj", + "_ZdaPvm", + "_ZdaPvRKSt9nothrow_t", + "_ZdaPvSt11align_val_t", + "_ZdlPvSt11align_val_tRKSt9nothrow_t", + "_ZdaPvSt11align_val_tRKSt9nothrow_t", + "_ZdlPvjSt11align_val_t", + "_ZdlPvmSt11align_val_t", + "_ZdaPvjSt11align_val_t", + "_ZdaPvmSt11align_val_t", + "??3@YAXPAX@Z", + "??3@YAXPAXABUnothrow_t@std@@@Z", + "??3@YAXPAXI@Z", + "??3@YAXPEAX@Z", + "??3@YAXPEAXAEBUnothrow_t@std@@@Z", + "??3@YAXPEAX_K@Z", + "??_V@YAXPAX@Z", + "??_V@YAXPAXABUnothrow_t@std@@@Z", + "??_V@YAXPAXI@Z", + "??_V@YAXPEAX@Z", + "??_V@YAXPEAXAEBUnothrow_t@std@@@Z", + "??_V@YAXPEAX_K@Z", + "__kmpc_free_shared" + }; + + if (const Function *Called = CB->getCalledFunction()) { + StringRef FnName = Called->getName(); + if (!llvm::any_of(FreeFnNames, [&](const StringRef FreeFnName) { + return FnName == FreeFnName; + })) + return false; + + // Confirm that this function is a recognized library function + LibFunc F; + bool FoundLibFunc = TLI->getLibFunc(*Called, F); + return FoundLibFunc; + } + + return false; +} + +static bool isAllocFn(const Instruction *I, const TargetLibraryInfo *TLI) { + if (!isa(I)) + return false; + + if (!TLI) + return false; + + if (isAllocationFn(I, TLI)) + return true; + + // Ideally we would just use isAllocationFn to determine whether I is a call + // to an allocation funtion. But if -fno-builtin is used, then isAllocationFn + // won't recognize any allocation functions. For instrumentation purposes, + // it's sufficient to recognize the function name. 
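+  // The names below cover mangled C++ operator new variants, strdup-like
+  // helpers, and a few runtime-specific allocation entry points.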
+ const StringRef AllocFnNames[] = { + "_Znwj", + "_ZnwjRKSt9nothrow_t", + "_ZnwjSt11align_val_t", + "_ZnwjSt11align_val_tRKSt9nothrow_t", + "_Znwm", + "_ZnwmRKSt9nothrow_t", + "_ZnwmSt11align_val_t", + "_ZnwmSt11align_val_tRKSt9nothrow_t", + "_Znaj", + "_ZnajRKSt9nothrow_t", + "_ZnajSt11align_val_t", + "_ZnajSt11align_val_tRKSt9nothrow_t", + "_Znam", + "_ZnamRKSt9nothrow_t", + "_ZnamSt11align_val_t", + "_ZnamSt11align_val_tRKSt9nothrow_t", + "??2@YAPAXI@Z", + "??2@YAPAXIABUnothrow_t@std@@@Z", + "??2@YAPEAX_K@Z", + "??2@YAPEAX_KAEBUnothrow_t@std@@@Z", + "??_U@YAPAXI@Z", + "??_U@YAPAXIABUnothrow_t@std@@@Z", + "??_U@YAPEAX_K@Z", + "??_U@YAPEAX_KAEBUnothrow_t@std@@@Z", + "strdup", + "dunder_strdup", + "strndup", + "dunder_strndup", + "__kmpc_alloc_shared", + "posix_memalign" + }; + + if (const Function *Called = dyn_cast(I)->getCalledFunction()) { + StringRef FnName = Called->getName(); + if (!llvm::any_of(AllocFnNames, [&](const StringRef AllocFnName) { + return FnName == AllocFnName; + })) + return false; + + // Confirm that this function is a recognized library function + LibFunc F; + bool FoundLibFunc = TLI->getLibFunc(*Called, F); + return FoundLibFunc; + } + + return false; +} + +static bool isAllocFn(const Value *V, const TargetLibraryInfo *TLI) { + if (const CallBase *CB = dyn_cast(V)) + return isAllocFn(CB, TLI); + return false; +} + +static bool isReallocFn(const CallBase *Call) { + return (static_cast( + Call->getFnAttr(Attribute::AllocKind).getValueAsInt()) & + AllocFnKind::Realloc) != AllocFnKind::Unknown; +} + +static bool checkInstructionForRace(const Instruction *I, + const TargetLibraryInfo *TLI) { + if (isa(I) || isa(I) || isa(I) || + isa(I) || isa(I) || + isa(I) || isa(I)) + return true; + + if (const CallBase *Call = dyn_cast(I)) { + // Ignore debug info intrinsics + if (isa(I)) + return false; + + if (const Function *Called = Call->getCalledFunction()) { + // Check for detached.rethrow, taskframe.resume, or sync.unwind, which + // might be invoked. + if (Intrinsic::detached_rethrow == Called->getIntrinsicID() || + Intrinsic::taskframe_resume == Called->getIntrinsicID() || + Intrinsic::sync_unwind == Called->getIntrinsicID()) + return false; + + // Ignore CSI and Cilksan functions + if (Called->hasName() && (Called->getName().startswith("__csi") || + Called->getName().startswith("__csan") || + Called->getName().startswith("__cilksan"))) + return false; + } + + // Ignore other intrinsics. + if (const IntrinsicInst *II = dyn_cast(I)) { + // Ignore intrinsics that do not access memory. + if (II->doesNotAccessMemory()) + return false; + // TODO: Exclude all intrinsics for which + // TTI::getIntrinsicCost() == TCC_Free? + switch (II->getIntrinsicID()) { + default: return true; + case Intrinsic::annotation: + case Intrinsic::assume: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + case Intrinsic::experimental_noalias_scope_decl: + case Intrinsic::syncregion_start: + case Intrinsic::taskframe_create: + case Intrinsic::taskframe_use: + case Intrinsic::taskframe_end: + case Intrinsic::taskframe_load_guard: + case Intrinsic::sync_unwind: + return false; + } + } + + // We can assume allocation functions are safe. 
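+      // The exception is realloc: it reads from the old allocation and writes
+      // the new one, so it can still participate in a race even when the
+      // allocation itself is assumed to be safe.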
+ if (AssumeSafeMalloc && isAllocFn(Call, TLI)) { + return isReallocFn(Call); + } + + // If this call occurs in a termination block of the program, ignore it. + if (IgnoreTerminationCalls && + isa(I->getParent()->getTerminator())) { + const Function *CF = Call->getCalledFunction(); + // If this function call is indirect, we want to instrument it. + if (!CF) + return true; + // If this is an ordinary function call in a terminating block, ignore it. + if (!CF->hasFnAttribute(Attribute::NoReturn)) + return false; + // If this is a call to a terminating function, such as "exit" or "abort", + // ignore it. + if (CF->hasName() && + ((CF->getName() == "exit") || (CF->getName() == "abort") || + (CF->getName() == "__clang_call_terminate") || + (CF->getName() == "__assert_fail"))) + return false; + } + + // We want to instrument calls in general. + return true; + } + return false; +} + +// Get the general memory accesses for the instruction \p I, and stores those +// accesses into \p AccI. Returns true if general memory accesses could be +// derived for I, false otherwise. +static void GetGeneralAccesses( + Instruction *I, SmallVectorImpl &AccI, AliasAnalysis *AA, + const TargetLibraryInfo *TLI) { + // Handle common memory instructions + if (LoadInst *LI = dyn_cast(I)) { + MemoryLocation Loc = MemoryLocation::get(LI); + if (!AA->pointsToConstantMemory(Loc)) + AccI.push_back(GeneralAccess(LI, Loc, ModRefInfo::Ref)); + return; + } + if (StoreInst *SI = dyn_cast(I)) { + AccI.push_back(GeneralAccess(SI, MemoryLocation::get(SI), ModRefInfo::Mod)); + return; + } + // Handle atomic instructions + if (AtomicCmpXchgInst *CXI = dyn_cast(I)) { + AccI.push_back(GeneralAccess(CXI, MemoryLocation::get(CXI), + ModRefInfo::Mod)); + return; + } + if (AtomicRMWInst *RMWI = dyn_cast(I)) { + AccI.push_back(GeneralAccess(RMWI, MemoryLocation::get(RMWI), + ModRefInfo::Mod)); + return; + } + + // Handle VAArgs. + if (VAArgInst *VAAI = dyn_cast(I)) { + MemoryLocation Loc = MemoryLocation::get(VAAI); + if (!AA->pointsToConstantMemory(Loc)) + AccI.push_back(GeneralAccess(VAAI, Loc, ModRefInfo::ModRef)); + return; + } + + // Handle memory intrinsics. + if (AnyMemSetInst *MSI = dyn_cast(I)) { + AccI.push_back(GeneralAccess(MSI, MemoryLocation::getForDest(MSI), + ModRefInfo::Mod)); + return; + } + if (AnyMemTransferInst *MTI = dyn_cast(I)) { + AccI.push_back(GeneralAccess(MTI, MemoryLocation::getForDest(MTI), + 0, ModRefInfo::Mod)); + MemoryLocation Loc = MemoryLocation::getForSource(MTI); + if (!AA->pointsToConstantMemory(Loc)) + AccI.push_back(GeneralAccess(MTI, Loc, 1, ModRefInfo::Ref)); + return; + } + + // Handle arbitrary call sites by examining pointee arguments. + // + // This logic is based on that in AliasSetTracker.cpp. + if (const CallBase *Call = dyn_cast(I)) { + ModRefInfo CallMask = AA->getMemoryEffects(Call).getModRef(); + + // Some intrinsics are marked as modifying memory for control flow modelling + // purposes, but don't actually modify any specific memory location. + using namespace PatternMatch; + if (Call->use_empty() && + match(Call, m_Intrinsic())) + CallMask &= ModRefInfo::Ref; + // TODO: See if we need to exclude additional intrinsics. + + if (isAllocFn(Call, TLI)) { + // Handle realloc as a special case. + if (isReallocFn(Call)) { + // TODO: Try to get the size of the object being copied from. + AccI.push_back(GeneralAccess(I, MemoryLocation::getForArgument( + Call, 0, TLI), 0, + AA->getArgModRefInfo(Call, 0))); + // If we assume malloc is safe, don't worry about opaque accesses by + // realloc. 
+ if (!AssumeSafeMalloc) + AccI.push_back(GeneralAccess(I, std::nullopt, CallMask)); + return; + } + } + + for (auto IdxArgPair : enumerate(Call->args())) { + int ArgIdx = IdxArgPair.index(); + const Value *Arg = IdxArgPair.value(); + if (!Arg->getType()->isPointerTy()) + continue; + MemoryLocation ArgLoc = + MemoryLocation::getForArgument(Call, ArgIdx, TLI); + if (AA->pointsToConstantMemory(ArgLoc)) + continue; + ModRefInfo ArgMask = AA->getArgModRefInfo(Call, ArgIdx); + ArgMask &= CallMask; + if (!isNoModRef(ArgMask)) { + AccI.push_back(GeneralAccess(I, ArgLoc, ArgIdx, ArgMask)); + } + } + + // If we find a free call and we assume malloc is safe, don't worry about + // opaque accesses by that free call. + if (AssumeSafeMalloc && getFreedOperand(Call, TLI)) + return; + + if (!Call->onlyAccessesArgMemory()) + // Add a generic GeneralAccess for this call to represent the fact that it + // might access arbitrary global memory. + AccI.push_back(GeneralAccess(I, std::nullopt, CallMask)); + return; + } +} + +void AccessPtrAnalysis::addFunctionArgument(Value *Arg) { + ArgumentPtrs.insert(Arg); +} + +void AccessPtrAnalysis::addAccess(Instruction *I) { + if (checkInstructionForRace(I, TLI)) { + + // Exclude calls to functions in ABIList. + if (const CallBase *Call = dyn_cast(I)) { + if (const Function *CF = Call->getCalledFunction()) + if (ABIList.isIn(*CF)) + return; + } else { + MemoryLocation Loc = MemoryLocation::get(I); + if (Loc.Ptr) { + if (const Value *UnderlyingObj = getUnderlyingObject(Loc.Ptr, 0)) { + if (const GlobalVariable *GV = + dyn_cast(UnderlyingObj)) + if (ABIList.isIn(*GV)) + return; + if (ABIList.isIn(*UnderlyingObj->getType())) + return; + } + } + } + + SmallVector GA; + GetGeneralAccesses(I, GA, DI.getAA(), TLI); + TaskAccessMap[TI.getTaskFor(I->getParent())].append(GA.begin(), GA.end()); + SpindleAccessMap[TI.getSpindleFor(I->getParent())].append(GA.begin(), + GA.end()); + if (Loop *L = LI.getLoopFor(I->getParent())) + LoopAccessMap[L].append(GA.begin(), GA.end()); + + for (GeneralAccess Acc : GA) { + // Skip this access if it does not have a valid pointer. + if (!Acc.getPtr()) + continue; + + MemAccessInfo Access(Acc.getPtr(), Acc.isMod()); + // DepCands.insert(Access); + + SmallVector Objects; + LLVM_DEBUG(dbgs() << "Getting underlying objects for " << *Acc.getPtr() + << "\n"); + getUnderlyingObjects(const_cast(Acc.getPtr()), Objects, &LI, 0); + for (const Value *Obj : Objects) { + LLVM_DEBUG(dbgs() << " Considering object: " << *Obj << "\n"); + // nullptr never alias, don't join sets for pointer that have "null" in + // their UnderlyingObjects list. + if (isa(Obj) && + !NullPointerIsDefined(I->getFunction(), + Obj->getType()->getPointerAddressSpace())) + continue; + + // Is this value a constant that cannot be derived from any pointer + // value (we need to exclude constant expressions, for example, that + // are formed from arithmetic on global symbols). + if (const Constant *C = dyn_cast(Obj)) { + // This check is derived from Transforms/Utils/InlineFunction.cpp + bool IsNonPtrConst = isa(C) || isa(C) || + isa(C) || isa(C) || + isa(C) || isa(C) || + isa(C); + if (IsNonPtrConst) + continue; + } + + if (const GlobalVariable *GV = dyn_cast(Obj)) + // Constant variables cannot race. 
+ if (GV->isConstant()) + continue; + + if (isa(Obj)) + // Assume that functions are read-only + continue; + + LLVM_DEBUG(dbgs() << "Adding object for access:\n Obj: " << *Obj + << "\n Access: " << *Acc.getPtr() << "\n"); + AccessToObjs[Access].insert(Obj); + + // UnderlyingObjToAccessMap::iterator Prev = ObjToLastAccess.find(Obj); + // if (Prev != ObjToLastAccess.end()) + // DepCands.unionSets(Access, Prev->second); + + // ObjToLastAccess[Obj] = Access; + } + } + } +} + +static const Loop *getCommonLoop(const BasicBlock *B1, const BasicBlock *B2, + LoopInfo &LI) { + unsigned B1Level = LI.getLoopDepth(B1); + unsigned B2Level = LI.getLoopDepth(B2); + const Loop *L1 = LI.getLoopFor(B1); + const Loop *L2 = LI.getLoopFor(B2); + while (B1Level > B2Level) { + L1 = L1->getParentLoop(); + B1Level--; + } + while (B2Level > B1Level) { + L2 = L2->getParentLoop(); + B2Level--; + } + while (L1 != L2) { + L1 = L1->getParentLoop(); + L2 = L2->getParentLoop(); + } + return L1; +} + +static const Loop *getCommonLoop(const Loop *L, const BasicBlock *B, + LoopInfo &LI) { + unsigned L1Level = L->getLoopDepth(); + unsigned L2Level = LI.getLoopDepth(B); + const Loop *L1 = L; + const Loop *L2 = LI.getLoopFor(B); + while (L1Level > L2Level) { + L1 = L1->getParentLoop(); + L1Level--; + } + while (L2Level > L1Level) { + L2 = L2->getParentLoop(); + L2Level--; + } + while (L1 != L2) { + L1 = L1->getParentLoop(); + L2 = L2->getParentLoop(); + } + return L1; +} + +static const Spindle *GetRepSpindleInTask(const Spindle *S, const Task *T, + const TaskInfo &TI) { + const Task *Encl = T->getSubTaskEnclosing(S->getEntry()); + if (Encl->isRootTask()) + return S; + return TI.getSpindleFor(Encl->getDetach()->getContinue()); +} + +bool AccessPtrAnalysis::checkDependence(std::unique_ptr D, + GeneralAccess &GA1, + GeneralAccess &GA2) { + if (!D) { + LLVM_DEBUG(dbgs() << "No dependence\n"); + return false; + } + + LLVM_DEBUG({ + D->dump(dbgs()); + StringRef DepType = D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; + dbgs() << "Found " << DepType << " dependency between Src and Dst\n"; + unsigned Levels = D->getLevels(); + for (unsigned II = 1; II <= Levels; ++II) { + const SCEV *Distance = D->getDistance(II); + if (Distance) + dbgs() << "Level " << II << " distance " << *Distance << "\n"; + } + }); + + Instruction *I1 = GA1.I; + Instruction *I2 = GA2.I; + BasicBlock *B1 = I1->getParent(); + BasicBlock *B2 = I2->getParent(); + + // Only dependencies that cross tasks can produce determinacy races. + // Dependencies that cross loop iterations within the same task don't matter. + + // Find the deepest loop that contains both B1 and B2. + const Loop *CommonLoop = getCommonLoop(B1, B2, LI); + unsigned MaxLoopDepthToCheck = CommonLoop ? CommonLoop->getLoopDepth() : 0; + + // Check if dependence does not depend on looping. + if (0 == MaxLoopDepthToCheck) + // If there's no loop to worry about, then the existence of the dependence + // implies the potential for a race. + return true; + + // Use the base objects for the addresses to try to further refine the checks. + + // TODO: Use lifetime_begin intrinsics to further refine checks. 
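+  // If every underlying object shared by the two accesses is allocated inside
+  // the common loop nest (by an alloca or a noalias-returning call), each
+  // execution of that allocation yields a distinct object, so the depth of the
+  // allocation's loop gives a lower bound on the loop levels whose dependence
+  // directions must be examined.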
+ const Loop *CommonObjLoop = CommonLoop; + unsigned MinObjDepth = CommonLoop->getLoopDepth(); + SmallPtrSet BaseObjs; + MemAccessInfo MA1(GA1.getPtr(), GA1.isMod()); + MemAccessInfo MA2(GA2.getPtr(), GA2.isMod()); + for (const Value *Obj : AccessToObjs[MA1]) { + if (AccessToObjs[MA2].count(Obj)) + BaseObjs.insert(Obj); + else { + MinObjDepth = 0; + break; + } + } + for (const Value *Obj : AccessToObjs[MA2]) { + if (AccessToObjs[MA1].count(Obj)) + BaseObjs.insert(Obj); + else { + MinObjDepth = 0; + break; + } + } + + // If we didn't find any base objects, we have no common-object loop. + if (BaseObjs.empty()) + CommonObjLoop = nullptr; + + // Set MinObjDepth to 0 if there are not base objects to check. + if (BaseObjs.empty() || !CommonObjLoop) + MinObjDepth = 0; + + if (MinObjDepth != 0) { + for (const Value *Obj : BaseObjs) { + // If there are no more levels of common loop to check, return. + if (!CommonObjLoop) + break; + + LLVM_DEBUG(dbgs() << "Checking base object " << *Obj << "\n"); + assert(!(isa(Obj) && + !NullPointerIsDefined(B1->getParent(), + Obj->getType()->getPointerAddressSpace())) + && "nullptr in list of base objects"); + + // If the object is not an instruction, then there's no common loop to + // find. + if (!isa(Obj)) { + CommonObjLoop = nullptr; + break; + } + + // This optimization of bounding the loop nest to check only applies if + // the underlying objects perform an allocation. + const Instruction *ObjI = dyn_cast(Obj); + if (!isa(ObjI) && !isa(ObjI)) { + CommonObjLoop = nullptr; + break; + } + if (isa(ObjI)) + // Update the common loop for the underlying objects. + CommonObjLoop = getCommonLoop(CommonObjLoop, ObjI->getParent(), LI); + else if (const CallBase *CB = dyn_cast(ObjI)) { + if (!CB->returnDoesNotAlias()) { + CommonObjLoop = nullptr; + break; + } + // Update the common loop for the underlying objects. + CommonObjLoop = getCommonLoop(CommonObjLoop, ObjI->getParent(), LI); + } + } + } + // Save the depth of the common loop as the lower bound on the loop depth to + // check. + if (!CommonObjLoop) { + LLVM_DEBUG(dbgs() << "No common loop found for underlying objects.\n"); + MinObjDepth = 0; + } else + MinObjDepth = CommonObjLoop->getLoopDepth(); + + LLVM_DEBUG(dbgs() << "Min loop depth " << MinObjDepth << + " for underlying object.\n"); + + LLVM_DEBUG({ + if (MinObjDepth > MaxLoopDepthToCheck) { + dbgs() << "\tI1 " << *I1 << "\n\tI2 " << *I2; + dbgs() << "\n\tPtr1 " << *GA1.getPtr() + << " (null? " << (isa(GA1.getPtr())) << ")"; + dbgs() << "\n\tPtr2 " << *GA2.getPtr() + << " (null? " << (isa(GA2.getPtr())) << ")"; + dbgs() << "\n\tAddrspace " + << GA1.getPtr()->getType()->getPointerAddressSpace(); + dbgs() << "\n\tnullptr is defined? " + << (NullPointerIsDefined(B1->getParent())); + dbgs() << "\n\tMaxLoopDepthToCheck " << MaxLoopDepthToCheck; + dbgs() << "\n\tMinObjDepthToCheck " << MinObjDepth << "\n"; + } + }); + assert(MinObjDepth <= MaxLoopDepthToCheck && + "Minimum loop depth of underlying object cannot be greater " + "than maximum loop depth of dependence."); + + // Get the task that encloses both B1 and B2. + const Task *CommonTask = TI.getEnclosingTask(B1, B2); + // Get the representative spindles for both B1 and B2 in this common task. 
+ const Spindle *I1Spindle = GetRepSpindleInTask(TI.getSpindleFor(B1), + CommonTask, TI); + const Spindle *I2Spindle = GetRepSpindleInTask(TI.getSpindleFor(B2), + CommonTask, TI); + // If this common loop does not contain the common task, then dependencies at + // the level of this common loop do not constitute a potential race. Find the + // loop that contains the enclosing task. + // + // Skip this step if either representative spindle is a shared-eh spindle, + // because those are more complicated. + if (!I1Spindle->isSharedEH() && !I2Spindle->isSharedEH()) { + if (!CommonLoop->contains(CommonTask->getEntry())) { + const Loop *CommonTaskLoop = LI.getLoopFor(CommonTask->getEntry()); + // Typically, CommonTaskLoop is a subloop of CommonLoop. But that doesn't + // have to be true, e.g., if CommonLoop appears in an exit of + // CommonTaskLoop. + CommonLoop = CommonTaskLoop; + } + // Update MaxLoopDepthToCheck + MaxLoopDepthToCheck = CommonLoop ? CommonLoop->getLoopDepth() : 0; + + // Check if dependence does not depend on looping. + if (0 == MaxLoopDepthToCheck) + MaxLoopDepthToCheck = MinObjDepth; + } + + if (MaxLoopDepthToCheck == MinObjDepth) { + LLVM_DEBUG(dbgs() << "Minimum object depth matches maximum loop depth.\n"); + if (TI.getTaskFor(B1) == TI.getTaskFor(B2)) + return false; + + // Check if dependence does not depend on looping. + if (0 == MaxLoopDepthToCheck) + // If there's no loop to worry about, then the existence of the dependence + // implies the potential for a race. + return true; + + if (!(D->getDirection(MaxLoopDepthToCheck) & Dependence::DVEntry::EQ)) + // Apparent dependence does not occur within the same iteration. + return false; + + // Check if the instructions are parallel when the loop backedge is excluded + // from dataflow. + for (const Task *MPT : MPTasksInLoop.TaskList[I1Spindle]) + if (TI.encloses(MPT, B2)) + return true; + for (const Task *MPT : MPTasksInLoop.TaskList[I2Spindle]) + if (TI.encloses(MPT, B1)) + return true; + + return false; + } + + // Get the whole loop stack to check above the common loop. + SmallVector LoopsToCheck; + const Loop *CurrLoop = CommonLoop; + while (CurrLoop) { + LoopsToCheck.push_back(CurrLoop); + CurrLoop = CurrLoop->getParentLoop(); + } + + // Check the loop stack from the top down until a loop is found where the + // dependence might cross parallel tasks. + unsigned MinLoopDepthToCheck = 1; + while (!LoopsToCheck.empty()) { + const Loop *CurrLoop = LoopsToCheck.pop_back_val(); + // If we're not yet at the minimum loop depth of the underlying object, go + // deeper. + if (MinLoopDepthToCheck < MinObjDepth) { + ++MinLoopDepthToCheck; + continue; + } + + // Check the maybe-parallel tasks for the spindle containing the loop + // header. + const Spindle *CurrSpindle = TI.getSpindleFor(CurrLoop->getHeader()); + bool MPTEnclosesDst = false; + for (const Task *MPT : MPTasks.TaskList[CurrSpindle]) { + if (TI.encloses(MPT, B2)) { + MPTEnclosesDst = true; + break; + } + } + + // If Dst is found in a maybe-parallel task, then the minimum loop depth has + // been found. + if (MPTEnclosesDst) + break; + // Otherwise go deeper. + ++MinLoopDepthToCheck; + } + + // Scan the loop nests in common from inside out. 
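+  // A level at which the dependence is scalar, or at which the direction
+  // vector permits anything other than EQ, means the dependence can be carried
+  // between different iterations of a loop whose iterations may run in
+  // parallel, so report a potential race.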
+ for (unsigned II = MaxLoopDepthToCheck; II >= MinLoopDepthToCheck; --II) { + LLVM_DEBUG(dbgs() << "Checking loop level " << II << "\n"); + if (D->isScalar(II)) + return true; + if (D->getDirection(II) & unsigned(~Dependence::DVEntry::EQ)) + return true; + } + + LLVM_DEBUG(dbgs() << "Dependence does not cross parallel tasks.\n"); + return false; +} + +bool AccessPtrAnalysis::PointerCapturedBefore(const Value *Ptr, + const Instruction *I, + unsigned MaxUsesToExplore = + MaxUsesToExploreCapture) const { + const Value *StrippedPtr = Ptr->stripInBoundsOffsets(); + // Do not treat NULL pointers as captured. + if (isa(StrippedPtr)) + return false; + auto CaptureQuery = std::make_pair(StrippedPtr, I); + if (MayBeCapturedCache.count(CaptureQuery)) + return MayBeCapturedCache[CaptureQuery]; + + bool Result = false; + if (isa(StrippedPtr)) + // We assume that globals are captured. + // + // TODO: Possibly refine this check for private or internal globals. + Result = true; + else if (!isa(StrippedPtr)) { + // If we could strip the pointer, we conservatively assume it may be + // captured. + LLVM_DEBUG(dbgs() << "PointerCapturedBefore: Could not fully strip pointer " + << *Ptr << "\n"); + Result = true; + } else + Result = PointerMayBeCapturedBefore(StrippedPtr, false, false, I, &DT, true, + MaxUsesToExplore); + MayBeCapturedCache[CaptureQuery] = Result; + return Result; +} + +bool AccessPtrAnalysis::checkOpaqueAccesses(GeneralAccess &GA1, + GeneralAccess &GA2) { + // If neither instruction may write to memory, then no race is possible. + if (!GA1.I->mayWriteToMemory() && !GA2.I->mayWriteToMemory()) + return false; + + if (!GA1.Loc && !GA2.Loc) { + LLVM_DEBUG({ + const CallBase *Call1 = cast(GA1.I); + const CallBase *Call2 = cast(GA2.I); + + assert(!AA->doesNotAccessMemory(Call1) && + !AA->doesNotAccessMemory(Call2) && + "Opaque call does not access memory."); + assert(!AA->getMemoryEffects(Call1).onlyAccessesArgPointees() && + !AA->getMemoryEffects(Call2).onlyAccessesArgPointees() && + "Opaque call only accesses arg pointees."); + }); + // // If both calls only read memory, then there's no dependence. + // if (AA->onlyReadsMemory(Call1) && AA->onlyReadsMemory(Call2)) + // return false; + + // We have two logically-parallel calls that opaquely access memory, and at + // least one call modifies memory. Hence we have a dependnece and potential + // race. + return true; + } + + BasicBlock *B1 = GA1.I->getParent(); + BasicBlock *B2 = GA2.I->getParent(); + + // Get information about the non-opaque access. + const Value *Ptr; + Instruction *NonOpaque; + if (GA1.Loc) { + Ptr = GA1.getPtr(); + NonOpaque = GA1.I; + } else { // GA2.Loc + Ptr = GA2.getPtr(); + NonOpaque = GA2.I; + } + + // One access is opaque, while the other has a pointer. For the opaque access + // to race, the pointer must escape before the non-opaque instruction. + if (!PointerCapturedBefore(Ptr, NonOpaque)) + return false; + + // TODO: Use the instruction that performs the capture to further bound the + // subsequent loop checks. + + // Otherwise we check the logical parallelism of the access. Because one of + // the pointers is null, we assume that the "minimum object depth" is 0. + unsigned MinObjDepth = 0; + LLVM_DEBUG(dbgs() << "Min loop depth " << MinObjDepth + << " used for opaque accesses.\n"); + + // Find the deepest loop that contains both B1 and B2. + const Loop *CommonLoop = getCommonLoop(B1, B2, LI); + unsigned MaxLoopDepthToCheck = CommonLoop ? 
CommonLoop->getLoopDepth() : 0; + + // Check if dependence does not depend on looping. + if (0 == MaxLoopDepthToCheck) + // If there's no loop to worry about, then the existence of the dependence + // implies the potential for a race. + return true; + + LLVM_DEBUG( + if (MinObjDepth > MaxLoopDepthToCheck) { + dbgs() << "\tI1 " << *GA1.I << "\n\tI2 " << *GA2.I; + dbgs() << "\n\tMaxLoopDepthToCheck " << MaxLoopDepthToCheck; + dbgs() << "\n\tMinObjDepthToCheck " << MinObjDepth << "\n"; + dbgs() << *GA1.I->getFunction(); + }); + assert(MinObjDepth <= MaxLoopDepthToCheck && + "Minimum loop depth of underlying object cannot be greater " + "than maximum loop depth of dependence."); + + // Get the task that encloses both B1 and B2. + const Task *CommonTask = TI.getEnclosingTask(B1, B2); + // Get the representative spindles for both B1 and B2 in this common task. + const Spindle *I1Spindle = GetRepSpindleInTask(TI.getSpindleFor(B1), + CommonTask, TI); + const Spindle *I2Spindle = GetRepSpindleInTask(TI.getSpindleFor(B2), + CommonTask, TI); + // If this common loop does not contain the common task, then dependencies at + // the level of this common loop do not constitute a potential race. Find the + // loop that contains the enclosing task. + // + // Skip this step if either representative spindle is a shared-eh spindle, + // because those are more complicated. + if (!I1Spindle->isSharedEH() && !I2Spindle->isSharedEH()) { + if (!CommonLoop->contains(CommonTask->getEntry())) { + const Loop *CommonTaskLoop = LI.getLoopFor(CommonTask->getEntry()); + // Typically, CommonTaskLoop is a subloop of CommonLoop. But that doesn't + // have to be true, e.g., if CommonLoop appears in an exit of + // CommonTaskLoop. + // assert((!CommonTaskLoop || CommonTaskLoop->contains(CommonLoop)) && + // "Loop for common task does not contain common loop."); + CommonLoop = CommonTaskLoop; + } + // Update MaxLoopDepthToCheck + MaxLoopDepthToCheck = CommonLoop ? CommonLoop->getLoopDepth() : 0; + + // Check if dependence does not depend on looping. + if (0 == MaxLoopDepthToCheck) + MaxLoopDepthToCheck = MinObjDepth; + } + + if (MaxLoopDepthToCheck == MinObjDepth) { + LLVM_DEBUG(dbgs() << "Minimum object depth matches maximum loop depth.\n"); + if (TI.getTaskFor(B1) == TI.getTaskFor(B2)) + return false; + + // Check if dependence does not depend on looping. + if (0 == MaxLoopDepthToCheck) + // If there's no loop to worry about, then the existence of the dependence + // implies the potential for a race. + return true; + + // Check if the instructions are parallel when the loop backedge is excluded + // from dataflow. + for (const Task *MPT : MPTasksInLoop.TaskList[I1Spindle]) + if (TI.encloses(MPT, B2)) + return true; + for (const Task *MPT : MPTasksInLoop.TaskList[I2Spindle]) + if (TI.encloses(MPT, B1)) + return true; + + return false; + } + + // The opaque access acts like a dependence across all iterations of any loops + // containing the accesses. 
+ return true; +} + +static void setObjectMRForRace(RaceInfo::ObjectMRTy &ObjectMRForRace, + const Value *Ptr, ModRefInfo MRI) { + if (!ObjectMRForRace.count(Ptr)) + ObjectMRForRace[Ptr] = ModRefInfo::NoModRef; + ObjectMRForRace[Ptr] |= MRI; +} + +void AccessPtrAnalysis::recordLocalRace(const GeneralAccess &GA, + RaceInfo::ResultTy &Result, + RaceInfo::ObjectMRTy &ObjectMRForRace, + const GeneralAccess &Racer) { + Result.recordLocalRace(GA, Racer); + + if (!GA.getPtr()) + return; + + for (const Value *Obj : AccessToObjs[MemAccessInfo(GA.getPtr(), GA.isMod())]) { + if (GA.isMod()) + setObjectMRForRace(ObjectMRForRace, Obj, ModRefInfo::Ref); + setObjectMRForRace(ObjectMRForRace, Obj, ModRefInfo::Mod); + } +} + +static void recordAncestorRace(const GeneralAccess &GA, const Value *Ptr, + RaceInfo::ResultTy &Result, + RaceInfo::ObjectMRTy &ObjectMRForRace, + const GeneralAccess &Racer = GeneralAccess()) { + if (GA.isMod()) { + Result.recordRaceViaAncestorRef(GA, Racer); + setObjectMRForRace(ObjectMRForRace, Ptr, ModRefInfo::Ref); + } + Result.recordRaceViaAncestorMod(GA, Racer); + setObjectMRForRace(ObjectMRForRace, Ptr, ModRefInfo::Mod); +} + +static void recordOpaqueRace(const GeneralAccess &GA, const Value *Ptr, + RaceInfo::ResultTy &Result, + RaceInfo::ObjectMRTy &ObjectMRForRace, + const GeneralAccess &Racer = GeneralAccess()) { + if (GA.isMod()) { + Result.recordOpaqueRace(GA, Racer); + setObjectMRForRace(ObjectMRForRace, Ptr, ModRefInfo::Ref); + } + Result.recordOpaqueRace(GA, Racer); + setObjectMRForRace(ObjectMRForRace, Ptr, ModRefInfo::Mod); +} + +// Returns NoAlias/MayAliass/MustAlias for two memory locations based upon their +// underlaying objects. If LocA and LocB are known to not alias (for any reason: +// tbaa, non-overlapping regions etc), then it is known there is no dependecy. +// Otherwise the underlying objects are checked to see if they point to +// different identifiable objects. +AliasResult +AccessPtrAnalysis::underlyingObjectsAlias(const GeneralAccess &GAA, + const GeneralAccess &GAB) { + MemoryLocation LocA = *GAA.Loc; + MemoryLocation LocB = *GAB.Loc; + // Check the original locations (minus size) for noalias, which can happen for + // tbaa, incompatible underlying object locations, etc. + MemoryLocation LocAS = + MemoryLocation::getBeforeOrAfter(LocA.Ptr, LocA.AATags); + MemoryLocation LocBS = + MemoryLocation::getBeforeOrAfter(LocB.Ptr, LocB.AATags); + if (AA->alias(LocAS, LocBS) == AliasResult::NoAlias) + return AliasResult::NoAlias; + + // Check the underlying objects are the same + const Value *AObj = getUnderlyingObject(LocA.Ptr); + const Value *BObj = getUnderlyingObject(LocB.Ptr); + + // If the underlying objects are the same, they must alias + if (AObj == BObj) + return AliasResult::MustAlias; + + // We may have hit the recursion limit for underlying objects, or have + // underlying objects where we don't know they will alias. + if (!isIdentifiedObject(AObj) || !isIdentifiedObject(BObj)) { + if ((isIdentifiedObject(AObj) && !PointerCapturedBefore(AObj, GAB.I)) || + (isIdentifiedObject(BObj) && !PointerCapturedBefore(BObj, GAA.I))) + return AliasResult::NoAlias; + return AliasResult::MayAlias; + } + + // Otherwise we know the objects are different and both identified objects so + // must not alias. 
+ return AliasResult::NoAlias; +} + +static bool isThreadLocalObject(const Value *V) { + if (const IntrinsicInst *II = dyn_cast(V)) + return Intrinsic::threadlocal_address == II->getIntrinsicID(); + if (const GlobalValue *GV = dyn_cast(V)) + return GV->isThreadLocal(); + return false; +} + +void AccessPtrAnalysis::evaluateMaybeParallelAccesses( + GeneralAccess &GA1, GeneralAccess &GA2, RaceInfo::ResultTy &Result, + RaceInfo::ObjectMRTy &ObjectMRForRace) { + // No race is possible if no access modifies. + if (!GA1.isMod() && !GA2.isMod()) + return; + + bool LocalRace = false; + if (!GA1.getPtr() || !GA2.getPtr()) { + LLVM_DEBUG({ + dbgs() << "Checking for race involving opaque access:\n" + << " GA1 =\n"; + if (GA1.getPtr()) + dbgs() << " Ptr:" << *GA1.getPtr() << "\n"; + else + dbgs() << " Ptr: null\n"; + dbgs() << " I:" << *GA1.I << "\n" + << " GA2 =\n"; + if (GA2.getPtr()) + dbgs() << " Ptr:" << *GA2.getPtr() << "\n"; + else + dbgs() << " Ptr: null\n"; + dbgs() << " I:" << *GA2.I << "\n";}); + if (checkOpaqueAccesses(GA1, GA2)) + LocalRace = true; + } else { + // If either GA has a nullptr, then skip the check, since nullptr's cannot + // alias. + Function *F = GA1.I->getFunction(); + if (isa(GA1.getPtr()) && + !NullPointerIsDefined( + F, GA1.getPtr()->getType()->getPointerAddressSpace())) + return; + if (isa(GA2.getPtr()) && + !NullPointerIsDefined( + F, GA2.getPtr()->getType()->getPointerAddressSpace())) + return; + + // If the underlying objects cannot alias, then skip the check. + if (AliasResult::NoAlias == underlyingObjectsAlias(GA1, GA2)) + return; + + // If both objects are thread-local, then skip the check. + if (isThreadLocalObject(GA1.getPtr()) && isThreadLocalObject(GA2.getPtr())) + return; + + LLVM_DEBUG( + dbgs() << "Checking for race from dependence:\n" + << " GA1 =\n" + << " Ptr:" << *GA1.getPtr() << "\n I:" << *GA1.I << "\n" + << " GA2 =\n" + << " Ptr:" << *GA2.getPtr() << "\n I:" << *GA2.I << "\n"); + if (checkDependence(DI.depends(&GA1, &GA2, true), GA1, GA2)) + LocalRace = true; + } + + if (LocalRace) { + LLVM_DEBUG(dbgs() << "Local race found:\n" + << " I1 =" << *GA1.I << "\n I2 =" << *GA2.I << "\n"); + recordLocalRace(GA1, Result, ObjectMRForRace, GA2); + recordLocalRace(GA2, Result, ObjectMRForRace, GA1); + } +} + +void AccessPtrAnalysis::checkForRacesHelper( + const Task *T, RaceInfo::ResultTy &Result, + RaceInfo::ObjectMRTy &ObjectMRForRace) { + SmallPtrSet Visited; + + // Now handle each spindle in this task. + for (const Spindle *S : + depth_first>(T->getEntrySpindle())) { + LLVM_DEBUG(dbgs() << "Testing Spindle@" << S->getEntry()->getName() + << "\n"); + for (GeneralAccess GA : SpindleAccessMap[S]) { + if (GA.getPtr()) { + LLVM_DEBUG({ + dbgs() << "GA Underlying objects:\n"; + for (const Value *Obj : + AccessToObjs[MemAccessInfo(GA.getPtr(), GA.isMod())]) + dbgs() << " " << *Obj << "\n"; + }); + for (const Value *Obj : + AccessToObjs[MemAccessInfo(GA.getPtr(), GA.isMod())]) { + if (isa(Obj)) + // Races on alloca'd objects are checked locally. + continue; + + if (AssumeSafeMalloc && isAllocFn(Obj, TLI)) + // Races on malloc'd objects are checked locally. + continue; + + if (const Argument *A = dyn_cast(Obj)) { + // Check if the attributes on the argument preclude a race with the + // caller. + if (A->hasByValAttr() || // A->hasNoAliasAttr() || + A->hasStructRetAttr() || A->hasInAllocaAttr()) + continue; + + // Otherwise record the possible race with an ancestor. 
+ LLVM_DEBUG(dbgs() << "Setting race via ancestor:\n" + << " GA.I: " << *GA.I << "\n" + << " Arg: " << *A << "\n"); + recordAncestorRace(GA, A, Result, ObjectMRForRace); + continue; + } + + if (const GlobalVariable *GV = dyn_cast(Obj)) { + // Constant variables cannot race. + assert(!GV->isConstant() && "Constant GV should be excluded."); + if (GV->hasPrivateLinkage() || GV->hasInternalLinkage()) { + // Races are only possible with ancestor functions in this module. + LLVM_DEBUG(dbgs() << "Setting race via private/internal global:\n" + << " GA.I: " << *GA.I << "\n" + << " GV: " << *GV << "\n"); + // TODO: Add MAAPs for private and internal global variables. + recordAncestorRace(GA, GV, Result, ObjectMRForRace); + // recordOpaqueRace(GA, GV, Result, ObjectMRForRace); + } else { + // Record the possible opaque race. + LLVM_DEBUG(dbgs() << "Setting opaque race:\n" + << " GA.I: " << *GA.I << "\n" + << " GV: " << *GV << "\n"); + recordOpaqueRace(GA, GV, Result, ObjectMRForRace); + } + continue; + } + + if (isa(Obj)) { + // Record the possible opaque race. + LLVM_DEBUG(dbgs() << "Setting opaque race:\n" + << " GA.I: " << *GA.I << "\n" + << " Obj: " << *Obj << "\n"); + recordOpaqueRace(GA, Obj, Result, ObjectMRForRace); + continue; + } + + if (!isa(Obj)) { + dbgs() << "ALERT: Unexpected underlying object: " << *Obj << "\n"; + } + + // Record the possible opaque race. + LLVM_DEBUG(dbgs() << "Setting opaque race:\n" + << " GA.I: " << *GA.I << "\n" + << " Obj: " << *Obj << "\n"); + recordOpaqueRace(GA, Obj, Result, ObjectMRForRace); + } + } + } + for (const Task *MPT : MPTasks.TaskList[S]) { + LLVM_DEBUG(dbgs() << "Testing against Task@" << MPT->getEntry()->getName() + << "\n"); + for (const Task *SubMPT : depth_first(MPT)) + for (GeneralAccess GA1 : SpindleAccessMap[S]) + for (GeneralAccess GA2 : TaskAccessMap[SubMPT]) + evaluateMaybeParallelAccesses(GA1, GA2, Result, ObjectMRForRace); + } + // If a successor of this spindle belongs to a subtask, recursively process + // that subtask. + for (const Spindle *Succ : successors(S)) { + if (S->succInSubTask(Succ)) { + // Skip successor spindles we've seen before. + if (!Visited.insert(Succ).second) + continue; + checkForRacesHelper(Succ->getParentTask(), Result, ObjectMRForRace); + } + } + } +} + +// /// Check whether a pointer can participate in a runtime bounds check. +// /// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr +// /// by adding run-time checks (overflow checks) if necessary. +// static bool hasComputableBounds(PredicatedScalarEvolution &PSE, +// const ValueToValueMap &Strides, Value *Ptr, +// Loop *L, bool Assume) { +// const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); + +// // The bounds for loop-invariant pointer is trivial. +// if (PSE.getSE()->isLoopInvariant(PtrScev, L)) +// return true; + +// const SCEVAddRecExpr *AR = dyn_cast(PtrScev); + +// if (!AR && Assume) +// AR = PSE.getAsAddRec(Ptr); + +// if (!AR) +// return false; + +// return AR->isAffine(); +// } + +// /// Check whether a pointer address cannot wrap. 
+// static bool isNoWrap(PredicatedScalarEvolution &PSE, +// const ValueToValueMap &Strides, Value *Ptr, Type *AccessTy, +// Loop *L) { +// const SCEV *PtrScev = PSE.getSCEV(Ptr); +// if (PSE.getSE()->isLoopInvariant(PtrScev, L)) +// return true; + +// int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides); +// if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) +// return true; + +// return false; +// } + +// namespace { +// // This class is based on LoopAccessAnalysis, but is not focused on +// // vectorization. +// class RTPtrCheckAnalysis { +// public: +// using MemAccessInfo = PointerIntPair; +// using MemAccessInfoList = SmallVector; +// using DepCandidates = EquivalenceClasses; +// using UnderlyingObjToAccessMap = DenseMap; + +// RTPtrCheckAnalysis(Loop *L, RuntimePointerChecking &RtCheck, +// AliasAnalysis *AA, ScalarEvolution &SE) +// : TheLoop(L), RtCheck(RtCheck), PSE(SE, *L), AST(*AA) {} + +// void addAccess(GeneralAccess GA, bool IsReadOnlyPtr = false) { +// if (GA.getPtr()) { +// LLVM_DEBUG(dbgs() << "Adding access for RT pointer checking:\n" +// << " GA.I: " << *GA.I << "\n" +// << " GA.Ptr: " << *GA.getPtr() << "\n"); +// AST.add(GA.I); +// Value *Ptr = const_cast(GA.getPtr()); +// Accesses.insert(MemAccessInfo(Ptr, GA.isMod())); +// if (IsReadOnlyPtr) +// ReadOnlyPtr.insert(Ptr); +// collectStridedAccess(GA.I); +// } +// } +// void processAccesses( +// AccessPtrAnalysis::AccessToUnderlyingObjMap &AccessToObjs); +// bool canCheckPtrAtRT(bool ShouldCheckWrap = false); + +// /// Initial processing of memory accesses determined that we need to +// /// perform dependency checking. +// /// +// /// Note that this can later be cleared if we retry memcheck analysis without +// /// dependency checking (i.e. FoundNonConstantDistanceDependence). +// bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } + +// private: +// void collectStridedAccess(Value *MemAccess); +// bool createCheckForAccess(MemAccessInfo Access, +// DenseMap &DepSetId, +// unsigned &RunningDepId, unsigned ASId, +// bool ShouldCheckWrap, bool Assume); + +// /// The loop being checked. +// Loop *TheLoop; + +// /// The resulting RT check. +// RuntimePointerChecking &RtCheck; + +// SetVector Accesses; + +// /// List of accesses that need a further dependence check. +// MemAccessInfoList CheckDeps; + +// /// Set of pointers that are read only. +// SmallPtrSet ReadOnlyPtr; + +// // Sets of potentially dependent accesses - members of one set share an +// // underlying pointer. The set "CheckDeps" identfies which sets really need a +// // dependence check. +// DepCandidates DepCands; + +// /// The SCEV predicate containing all the SCEV-related assumptions. +// PredicatedScalarEvolution PSE; + +// /// An alias set tracker to partition the access set by underlying object and +// /// intrinsic property (such as TBAA metadata). +// AliasSetTracker AST; + +// /// Initial processing of memory accesses determined that we may need +// /// to add memchecks. Perform the analysis to determine the necessary checks. +// /// +// /// Note that, this is different from isDependencyCheckNeeded. When we retry +// /// memcheck analysis without dependency checking +// /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is +// /// cleared while this remains set if we have potentially dependent accesses. +// bool IsRTCheckAnalysisNeeded = false; + +// /// If an access has a symbolic strides, this maps the pointer value to +// /// the stride symbol. 
+// ValueToValueMap SymbolicStrides; + +// /// Set of symbolic strides values. +// SmallPtrSet StrideSet; +// }; +// } // end anonymous namespace + +// // This code is borrowed from LoopAccessAnalysis.cpp +// void RTPtrCheckAnalysis::collectStridedAccess(Value *MemAccess) { +// Value *Ptr = nullptr; +// if (LoadInst *LI = dyn_cast(MemAccess)) +// Ptr = LI->getPointerOperand(); +// else if (StoreInst *SI = dyn_cast(MemAccess)) +// Ptr = SI->getPointerOperand(); +// else +// return; + +// Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop); +// if (!Stride) +// return; + +// LLVM_DEBUG(dbgs() << "TapirRD: Found a strided access that is a candidate " +// "for versioning:"); +// LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); + +// // Avoid adding the "Stride == 1" predicate when we know that +// // Stride >= Trip-Count. Such a predicate will effectively optimize a single +// // or zero iteration loop, as Trip-Count <= Stride == 1. +// // +// // TODO: We are currently not making a very informed decision on when it is +// // beneficial to apply stride versioning. It might make more sense that the +// // users of this analysis (such as the vectorizer) will trigger it, based on +// // their specific cost considerations; For example, in cases where stride +// // versioning does not help resolving memory accesses/dependences, the +// // vectorizer should evaluate the cost of the runtime test, and the benefit +// // of various possible stride specializations, considering the alternatives +// // of using gather/scatters (if available). + +// const SCEV *StrideExpr = PSE.getSCEV(Stride); +// const SCEV *BETakenCount = PSE.getBackedgeTakenCount(); + +// // Match the types so we can compare the stride and the BETakenCount. +// // The Stride can be positive/negative, so we sign extend Stride; +// // The backdgeTakenCount is non-negative, so we zero extend BETakenCount. +// const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); +// uint64_t StrideTypeSize = DL.getTypeAllocSize(StrideExpr->getType()); +// uint64_t BETypeSize = DL.getTypeAllocSize(BETakenCount->getType()); +// const SCEV *CastedStride = StrideExpr; +// const SCEV *CastedBECount = BETakenCount; +// ScalarEvolution *SE = PSE.getSE(); +// if (BETypeSize >= StrideTypeSize) +// CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType()); +// else +// CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType()); +// const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount); +// // Since TripCount == BackEdgeTakenCount + 1, checking +// // Stride >= TripCount is equivalent to checking +// // Stride - BETakenCount > 0 +// if (SE->isKnownPositive(StrideMinusBETaken)) { +// LLVM_DEBUG( +// dbgs() << "TapirRD: Stride>=TripCount; No point in versioning as the " +// "Stride==1 predicate will imply that the loop executes " +// "at most once.\n"); +// return; +// } +// LLVM_DEBUG(dbgs() << "TapirRD: Found a strided access that we can version."); + +// SymbolicStrides[Ptr] = Stride; +// StrideSet.insert(Stride); +// } + +// // This code is based on AccessAnalysis::processMemAccesses() in +// // LoopAccessAnalysis.cpp. +// void RTPtrCheckAnalysis::processAccesses( +// AccessPtrAnalysis::AccessToUnderlyingObjMap &AccessToObjs) { +// // The AliasSetTracker has nicely partitioned our pointers by metadata +// // compatibility and potential for underlying-object overlap. 
As a result, we +// // only need to check for potential pointer dependencies within each alias +// // set. +// for (auto &AS : AST) { +// // Note that both the alias-set tracker and the alias sets themselves used +// // linked lists internally and so the iteration order here is deterministic +// // (matching the original instruction order within each set). + +// bool SetHasWrite = false; + +// // Map of pointers to last access encountered. +// UnderlyingObjToAccessMap ObjToLastAccess; + +// // Set of access to check after all writes have been processed. +// SetVector DeferredAccesses; + +// // Iterate over each alias set twice, once to process read/write pointers, +// // and then to process read-only pointers. +// for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { +// bool UseDeferred = SetIteration > 0; +// SetVector &S = UseDeferred ? DeferredAccesses : Accesses; + +// for (auto AV : AS) { +// Value *Ptr = AV.getValue(); +// LLVM_DEBUG(dbgs() << "Found pointer is alias set: " << *Ptr << "\n"); + +// // For a single memory access in AliasSetTracker, Accesses may contain +// // both read and write, and they both need to be handled for CheckDeps. +// for (auto AC : S) { +// LLVM_DEBUG(dbgs() << " Access pointer: " << *AC.getPointer() << "\n"); +// if (AC.getPointer() != Ptr) +// continue; + +// bool IsWrite = AC.getInt(); + +// // If we're using the deferred access set, then it contains only +// // reads. +// bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; +// if (UseDeferred && !IsReadOnlyPtr) +// continue; +// // Otherwise, the pointer must be in the PtrAccessSet, either as a +// // read or a write. +// assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || +// S.count(MemAccessInfo(Ptr, false))) && +// "Alias-set pointer not in the access set?"); + +// MemAccessInfo Access(Ptr, IsWrite); +// DepCands.insert(Access); + +// // Memorize read-only pointers for later processing and skip them in +// // the first round (they need to be checked after we have seen all +// // write pointers). Note: we also mark pointer that are not +// // consecutive as "read-only" pointers (so that we check +// // "a[b[i]] +="). Hence, we need the second check for "!IsWrite". +// if (!UseDeferred && IsReadOnlyPtr) { +// DeferredAccesses.insert(Access); +// continue; +// } + +// // If this is a write - check other reads and writes for conflicts. If +// // this is a read only check other writes for conflicts (but only if +// // there is no other write to the ptr - this is an optimization to +// // catch "a[i] = a[i] + " without having to do a dependence check). 
+// if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { +// CheckDeps.push_back(Access); +// IsRTCheckAnalysisNeeded = true; +// } + +// if (IsWrite) +// SetHasWrite = true; + +// for (const Value *Obj : AccessToObjs[ +// AccessPtrAnalysis::MemAccessInfo(Ptr, IsWrite)]) { +// UnderlyingObjToAccessMap::iterator Prev = +// ObjToLastAccess.find(Obj); +// if (Prev != ObjToLastAccess.end()) +// DepCands.unionSets(Access, Prev->second); + +// ObjToLastAccess[Obj] = Access; +// } +// } +// } +// } +// } +// } + +// // This code is borrowed from LoopAccessAnalysis.cpp +// bool RTPtrCheckAnalysis::createCheckForAccess( +// MemAccessInfo Access, Type *AccessTy, DenseMap &DepSetId, +// unsigned &RunningDepId, unsigned ASId, bool ShouldCheckWrap, bool Assume) { +// Value *Ptr = Access.getPointer(); + +// if (!hasComputableBounds(PSE, SymbolicStrides, Ptr, TheLoop, Assume)) +// return false; + +// // When we run after a failing dependency check we have to make sure +// // we don't have wrapping pointers. +// if (ShouldCheckWrap && +// !isNoWrap(PSE, SymbolicStrides, Ptr, AccessTy, TheLoop)) { +// auto *Expr = PSE.getSCEV(Ptr); +// if (!Assume || !isa(Expr)) +// return false; +// PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW); +// } + +// // The id of the dependence set. +// unsigned DepId; + +// if (isDependencyCheckNeeded()) { +// Value *Leader = DepCands.getLeaderValue(Access).getPointer(); +// unsigned &LeaderId = DepSetId[Leader]; +// if (!LeaderId) +// LeaderId = RunningDepId++; +// DepId = LeaderId; +// } else +// // Each access has its own dependence set. +// DepId = RunningDepId++; + +// bool IsWrite = Access.getInt(); +// RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, SymbolicStrides, PSE); +// LLVM_DEBUG(dbgs() << "TapirRD: Found a runtime check ptr:" << *Ptr << '\n'); + +// return true; +// } + +// // This code is borrowed from LoopAccessAnalysis.cpp +// bool RTPtrCheckAnalysis::canCheckPtrAtRT(bool ShouldCheckWrap) { +// // Find pointers with computable bounds. We are going to use this information +// // to place a runtime bound check. +// bool CanDoRT = true; + +// bool NeedRTCheck = false; +// if (!IsRTCheckAnalysisNeeded) return true; + +// bool IsDepCheckNeeded = isDependencyCheckNeeded(); + +// // We assign a consecutive id to access from different alias sets. +// // Accesses between different groups doesn't need to be checked. +// unsigned ASId = 1; +// for (auto &AS : AST) { +// int NumReadPtrChecks = 0; +// int NumWritePtrChecks = 0; +// bool CanDoAliasSetRT = true; + +// // We assign consecutive id to access from different dependence sets. +// // Accesses within the same set don't need a runtime check. +// unsigned RunningDepId = 1; +// DenseMap DepSetId; + +// SmallVector Retries; + +// for (auto A : AS) { +// Value *Ptr = A.getValue(); +// bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); +// MemAccessInfo Access(Ptr, IsWrite); + +// if (IsWrite) +// ++NumWritePtrChecks; +// else +// ++NumReadPtrChecks; + +// if (!createCheckForAccess(Access, DepSetId, RunningDepId, ASId, +// ShouldCheckWrap, false)) { +// LLVM_DEBUG(dbgs() << "TapirRD: Can't find bounds for ptr:" << *Ptr << '\n'); +// Retries.push_back(Access); +// CanDoAliasSetRT = false; +// } +// } + +// // If we have at least two writes or one write and a read then we need to +// // check them. But there is no need to checks if there is only one +// // dependence set for this alias set. +// // +// // Note that this function computes CanDoRT and NeedRTCheck independently. 
+// // For example CanDoRT=false, NeedRTCheck=false means that we have a pointer +// // for which we couldn't find the bounds but we don't actually need to emit +// // any checks so it does not matter. +// bool NeedsAliasSetRTCheck = false; +// if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2)) +// NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 || +// (NumReadPtrChecks >= 1 && NumWritePtrChecks >= 1)); + +// // We need to perform run-time alias checks, but some pointers had bounds +// // that couldn't be checked. +// if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) { +// // Reset the CanDoSetRt flag and retry all accesses that have failed. +// // We know that we need these checks, so we can now be more aggressive +// // and add further checks if required (overflow checks). +// CanDoAliasSetRT = true; +// for (auto Access : Retries) +// if (!createCheckForAccess(Access, DepSetId, RunningDepId, ASId, +// ShouldCheckWrap, /*Assume=*/true)) { +// CanDoAliasSetRT = false; +// break; +// } +// } + +// CanDoRT &= CanDoAliasSetRT; +// NeedRTCheck |= NeedsAliasSetRTCheck; +// ++ASId; +// } + +// // If the pointers that we would use for the bounds comparison have different +// // address spaces, assume the values aren't directly comparable, so we can't +// // use them for the runtime check. We also have to assume they could +// // overlap. In the future there should be metadata for whether address spaces +// // are disjoint. +// unsigned NumPointers = RtCheck.Pointers.size(); +// for (unsigned i = 0; i < NumPointers; ++i) { +// for (unsigned j = i + 1; j < NumPointers; ++j) { +// // Only need to check pointers between two different dependency sets. +// if (RtCheck.Pointers[i].DependencySetId == +// RtCheck.Pointers[j].DependencySetId) +// continue; +// // Only need to check pointers in the same alias set. 
+// if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId) +// continue; + +// Value *PtrI = RtCheck.Pointers[i].PointerValue; +// Value *PtrJ = RtCheck.Pointers[j].PointerValue; + +// unsigned ASi = PtrI->getType()->getPointerAddressSpace(); +// unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); +// if (ASi != ASj) { +// LLVM_DEBUG( +// dbgs() << "TapirRD: Runtime check would require comparison between" +// " different address spaces\n"); +// return false; +// } +// } +// } + +// if (NeedRTCheck && CanDoRT) +// RtCheck.generateChecks(DepCands, IsDepCheckNeeded); + +// LLVM_DEBUG(dbgs() << "TapirRD: We need to do " << RtCheck.getNumberOfChecks() +// << " pointer comparisons.\n"); + +// RtCheck.Need = NeedRTCheck; + +// bool CanDoRTIfNeeded = !NeedRTCheck || CanDoRT; +// if (!CanDoRTIfNeeded) +// RtCheck.reset(); +// return CanDoRTIfNeeded; +// } + +// void AccessPtrAnalysis::getRTPtrChecks(Loop *L, RaceInfo::ResultTy &Result, +// RaceInfo::PtrChecksTy &AllPtrRtChecks) { +// LLVM_DEBUG(dbgs() << "getRTPtrChecks: " << *L << "\n"); + +// AllPtrRtChecks[L] = std::make_unique(&SE); + +// RTPtrCheckAnalysis RPCA(L, *AllPtrRtChecks[L].get(), AA, SE); +// SmallPtrSet Seen; +// // First handle all stores +// for (GeneralAccess GA : LoopAccessMap[L]) { +// // Exclude accesses not involved in a local race +// if (!Result.count(GA.I) || +// !RaceInfo::isLocalRace(Result.getRaceType(GA.I))) +// continue; + +// if (GA.isMod()) { +// RPCA.addAccess(GA); +// if (GA.getPtr()) +// Seen.insert(GA.getPtr()); +// } +// } +// // Now handle loads, checking if any pointers are only read from +// for (GeneralAccess GA : LoopAccessMap[L]) { +// // Exclude accesses not involved in a local race +// if (!Result.count(GA.I) || +// !RaceInfo::isLocalRace(Result.getRaceType(GA.I))) +// continue; + +// if (!GA.isMod()) { +// if (!GA.getPtr()) +// RPCA.addAccess(GA); + +// RPCA.addAccess(GA, !Seen.count(GA.getPtr())); +// } +// } + +// RPCA.processAccesses(AccessToObjs); +// // TODO: Do something with CanDoRTIfNeeded +// } + +void AccessPtrAnalysis::processAccessPtrs( + RaceInfo::ResultTy &Result, RaceInfo::ObjectMRTy &ObjectMRForRace, + RaceInfo::PtrChecksTy &AllPtrRtChecks) { + TI.evaluateParallelState(MPTasks); + TI.evaluateParallelState(MPTasksInLoop); + + // using InstPtrPair = std::pair; + // SmallPtrSet Visited; + for (const Spindle *S : + depth_first(TI.getRootTask()->getEntrySpindle())) { + for (GeneralAccess GA : SpindleAccessMap[S]) { + // InstPtrPair Visit = + // std::make_pair(GA.I, GA.getPtr()); + // // Skip instructions we've already visited. + // if (!Visited.insert(Visit).second) + // continue; + + if (!GA.getPtr()) { + if (const CallBase *Call = dyn_cast(GA.I)) { + if (!Call->onlyAccessesArgMemory() && + !(AssumeSafeMalloc && + (isAllocFn(Call, TLI) || isFreeFn(Call, TLI)))) { + LLVM_DEBUG(dbgs() << "Setting opaque race:\n" + << " GA.I: " << *GA.I << "\n" + << " no explicit racer\n"); + Result.recordOpaqueRace(GA, GeneralAccess()); + } + } + } + + // Check for aliasing against the function arguments. 
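+      // An access that may alias memory reachable through a pointer argument
+      // can race with code running in the caller, so it is recorded as a race
+      // via an ancestor rather than as a local race.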
+ for (Value *ArgPtr : ArgumentPtrs) { + LLVM_DEBUG({ + dbgs() << "Checking instruction against arg pointer:\n" + << " GA.I: " << *GA.I << "\n" + << " Arg: " << *ArgPtr << "\n"; + }); + if (!GA.getPtr()) { + ModRefInfo MRI = + AA->getModRefInfo(GA.I, MemoryLocation::getBeforeOrAfter(ArgPtr)); + Argument *Arg = cast(ArgPtr); + if (isModSet(MRI) && !Arg->onlyReadsMemory()) { + LLVM_DEBUG(dbgs() << " Mod is set.\n"); + Result.recordRaceViaAncestorRef(GA, GeneralAccess()); + Result.recordRaceViaAncestorMod(GA, GeneralAccess()); + setObjectMRForRace(ObjectMRForRace, ArgPtr, ModRefInfo::ModRef); + } + if (isRefSet(MRI)) { + LLVM_DEBUG(dbgs() << " Ref is set.\n"); + Result.recordRaceViaAncestorMod(GA, GeneralAccess()); + setObjectMRForRace(ObjectMRForRace, ArgPtr, ModRefInfo::Mod); + } + } else { + MemoryLocation GALoc = *GA.Loc; + if (AA->alias(GALoc, MemoryLocation::getBeforeOrAfter(ArgPtr))) { + Argument *Arg = cast(ArgPtr); + if (GA.isMod() && !Arg->onlyReadsMemory()) { + LLVM_DEBUG(dbgs() << " Mod is set.\n"); + Result.recordRaceViaAncestorRef(GA, GeneralAccess()); + Result.recordRaceViaAncestorMod(GA, GeneralAccess()); + setObjectMRForRace(ObjectMRForRace, ArgPtr, ModRefInfo::ModRef); + } + if (GA.isRef()) { + LLVM_DEBUG(dbgs() << " Ref is set.\n"); + Result.recordRaceViaAncestorMod(GA, GeneralAccess()); + setObjectMRForRace(ObjectMRForRace, ArgPtr, ModRefInfo::Mod); + } + } + } + } + } + } + checkForRacesHelper(TI.getRootTask(), Result, ObjectMRForRace); + + // Based on preliminary experiments, it doesn't appear that getRTPtrChecks, + // which is adapted from LoopAccessAnalysis, comes up with enough runtime + // pointer checks often enough to be worthwhile. It might be worth revisiting + // this code later. + + // for (Loop *TopLevelLoop : LI) { + // for (Loop *L : depth_first(TopLevelLoop)) { + // PredicatedScalarEvolution PSE(SE, *L); + // if (canAnalyzeLoop(L, PSE)) + // getRTPtrChecks(L, Result, AllPtrRtChecks); + // } + // } +} + +RaceInfo::RaceInfo(Function *F, DominatorTree &DT, LoopInfo &LI, TaskInfo &TI, + DependenceInfo &DI, ScalarEvolution &SE, + const TargetLibraryInfo *TLI) + : F(F), DT(DT), LI(LI), TI(TI), DI(DI), SE(SE), TLI(TLI) { + analyzeFunction(); +} + +void RaceInfo::getObjectsFor(Instruction *I, + SmallPtrSetImpl &Objects) { + SmallVector GA; + GetGeneralAccesses(I, GA, DI.getAA(), TLI); + for (GeneralAccess Acc : GA) { + // Skip this access if it does not have a valid pointer. 
+ if (!Acc.getPtr()) + continue; + + getObjectsFor(MemAccessInfo(Acc.getPtr(), Acc.isMod()), Objects); + } +} + +void RaceInfo::getObjectsFor(MemAccessInfo Access, + SmallPtrSetImpl &Objects) { + for (const Value *Obj : AccessToObjs[Access]) + Objects.insert(Obj); +} + +void RaceInfo::print(raw_ostream &OS) const { + if (Result.empty()) { + OS << "No possible races\n"; + return; + } + RaceType OverallRT = getOverallRaceType(); + OS << "Overall race type: "; + printRaceType(OverallRT, OS); + OS << "\n"; + for (auto Res : Result) { + OS << " Result: " << *Res.first << "\n"; + for (auto &RD : Res.second) { + if (RD.getPtr()) + OS << " ptr: " << *RD.getPtr(); + else + OS << " nullptr"; + OS << "\n"; + printRaceType(RD.Type, OS.indent(6)); + if (RD.Racer.isValid()) { + OS << "\n Racer:"; + OS << "\n I = " << *RD.Racer.I; + OS << "\n Loc = "; + if (!RD.Racer.Loc) + OS << "nullptr"; + else if (RD.Racer.Loc->Ptr == RD.getPtr()) + OS << "same pointer"; + else + OS << *RD.Racer.Loc->Ptr; + OS << "\n OperandNum = "; + if (RD.Racer.OperandNum == static_cast(-1)) + OS << "none"; + else + OS << RD.Racer.OperandNum; + OS << "\n ModRef = " << (RD.Racer.isMod() ? "Mod " : "") + << (RD.Racer.isRef() ? "Ref" : ""); + } + else + OS << "\n Opaque racer"; + OS << "\n"; + } + } + OS << "Underlying objects of races:\n"; + for (auto Res : ObjectMRForRace) { + OS << *Res.first << "\n "; + if (isModSet(Res.second)) + OS << " Mod"; + if (isRefSet(Res.second)) + OS << " Ref"; + OS << "\n"; + } +} + +// The main analysis routine. +void RaceInfo::analyzeFunction() { + LLVM_DEBUG(dbgs() << "Analyzing function '" << F->getName() << "'\n"); + + // At a high level, we need to identify pairs of instructions that might + // execute in parallel and alias. + + AccessPtrAnalysis APA(DT, TI, LI, DI, SE, TLI, AccessToObjs); + // Record pointer arguments to this function + for (Argument &Arg : F->args()) + if (Arg.getType()->isPtrOrPtrVectorTy()) + APA.addFunctionArgument(&Arg); + // TODO: Add global variables to APA. + + for (BasicBlock &BB : *F) { + for (Instruction &I : BB.instructionsWithoutDebug()) { + if (I.mayReadFromMemory() || I.mayWriteToMemory()) { + if (checkInstructionForRace(&I, TLI)) + APA.addAccess(&I); + } + } + } + + APA.processAccessPtrs(Result, ObjectMRForRace, AllPtrRtChecks); +} diff --git a/llvm/lib/Analysis/TapirTaskInfo.cpp b/llvm/lib/Analysis/TapirTaskInfo.cpp new file mode 100644 index 000000000000000..c89ad33133af307 --- /dev/null +++ b/llvm/lib/Analysis/TapirTaskInfo.cpp @@ -0,0 +1,1846 @@ +//===- TapirTaskInfo.cpp - Tapir task calculator --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the TapirTaskInfo class that is used to identify parallel +// tasks and spindles in Tapir. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/PrintPasses.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "task-info" + +// Statistics +STATISTIC(NumBasicBlocks, "Number of basic blocks analyzed."); +STATISTIC(NumTasks, "Number of tasks found."); +STATISTIC(NumSpindles, "Number of spindles found."); +STATISTIC(NumSharedEHSpindles, "Number of shared exception-handling spindles " + "found in this function."); +STATISTIC(NumBasicBlocksInPF, + "Number of basic blocks analyzed in parallel functions."); +STATISTIC(NumTasksInPF, "Number of tasks found in parallel functions."); +STATISTIC(NumSpindlesInPF, "Number of spindles found in parallel functions."); + +// Always verify taskinfo if expensive checking is enabled. +#ifdef EXPENSIVE_CHECKS +bool llvm::VerifyTaskInfo = true; +#else +bool llvm::VerifyTaskInfo = false; +#endif +static cl::opt + VerifyTaskInfoX("verify-task-info", cl::location(VerifyTaskInfo), + cl::Hidden, cl::desc("Verify task info (time consuming)")); + +static cl::opt PrintTaskFrameTree( + "print-taskframe-tree", cl::init(false), + cl::Hidden, cl::desc("Print tree of task frames.")); + +static cl::opt PrintMayHappenInParallel( + "print-may-happen-in-parallel", cl::init(false), + cl::Hidden, cl::desc("Print may-happen-in-parallel analysis results " + "derived from Tapir control flow.")); + +/// Returns the taskframe.create at the start of BB if one exists, nullptr +/// otherwise. +static const Instruction *getTaskFrameCreate(const BasicBlock *BB) { + if (const IntrinsicInst *II = dyn_cast(&BB->front())) + if (Intrinsic::taskframe_create == II->getIntrinsicID()) + return &BB->front(); + return nullptr; +} +static Instruction *getTaskFrameCreate(BasicBlock *BB) { + return const_cast( + getTaskFrameCreate(const_cast(BB))); +} + +static bool isCanonicalTaskFrameEnd(const Instruction *TFEnd) { + // Check that the last instruction in the basic block containing TFEnd is + // TFEnd. + const Instruction *Term = &TFEnd->getParent()->back(); + if (!Term || isa(Term) || isa(Term)) + return false; + + const Instruction *Prev = Term->getPrevNode(); + if (!Prev || Prev != TFEnd) + return false; + + return true; +} + +// Check if the given instruction is an intrinsic with the specified ID. If a +// value \p V is specified, then additionally checks that the first argument of +// the intrinsic matches \p V. This function matches the behavior of +// isTapirIntrinsic in Transforms/Utils/TapirUtils. 
+static bool isTapirIntrinsic(Intrinsic::ID ID, const Instruction *I,
+                             const Value *V = nullptr) {
+  if (const CallBase *CB = dyn_cast<CallBase>(I))
+    if (const Function *Called = CB->getCalledFunction())
+      if (ID == Called->getIntrinsicID())
+        if (!V || (V == CB->getArgOperand(0)))
+          return true;
+  return false;
+}
+
+// Check if the basic block terminates a taskframe via a taskframe.end.
+static bool endsUnassociatedTaskFrame(const BasicBlock *B) {
+  const Instruction *Prev = B->getTerminator()->getPrevNode();
+  if (!Prev)
+    return false;
+  if (isTapirIntrinsic(Intrinsic::taskframe_end, Prev) &&
+      isCanonicalTaskFrameEnd(Prev))
+    return true;
+  return false;
+}
+
+/// Checks if the given taskframe.create instruction is in canonical form. This
+/// function mirrors the behavior of needToSplitTaskFrameCreate in
+/// Transforms/Utils/TapirUtils.
+static bool isCanonicalTaskFrameCreate(const Instruction *TFCreate) {
+  // If the taskframe.create is not the first instruction, split.
+  if (TFCreate != &TFCreate->getParent()->front())
+    return false;
+
+  // The taskframe.create is at the front of the block. Check that we have a
+  // single predecessor.
+  const BasicBlock *Pred = TFCreate->getParent()->getSinglePredecessor();
+  if (!Pred)
+    return false;
+
+  // Check that the single predecessor has a single successor.
+  if (!Pred->getSingleSuccessor())
+    return false;
+
+  // Check whether the single predecessor is terminated with a sync.
+  if (isa<SyncInst>(Pred->getTerminator()))
+    return false;
+
+  // If the taskframe.create has no users, ignore it.
+  if (TFCreate->user_empty())
+    return false;
+
+  // Check that the uses of the taskframe.create are canonical as well.
+  for (const User *U : TFCreate->users()) {
+    if (const Instruction *I = dyn_cast<Instruction>(U)) {
+      if (isTapirIntrinsic(Intrinsic::taskframe_use, I) ||
+          isTapirIntrinsic(Intrinsic::taskframe_resume, I))
+        return true;
+      if (isTapirIntrinsic(Intrinsic::taskframe_end, I))
+        return isCanonicalTaskFrameEnd(I);
+    }
+  }
+  return true;
+}
+
+/// Returns true if the given instruction performs a detached rethrow, false
+/// otherwise.
+static bool isDetachedRethrow(const Instruction *I,
+                              const Value *SyncReg = nullptr) {
+  if (const InvokeInst *II = dyn_cast<InvokeInst>(I))
+    if (const Function *Called = II->getCalledFunction())
+      if (Intrinsic::detached_rethrow == Called->getIntrinsicID())
+        if (!SyncReg || (SyncReg == II->getArgOperand(0)))
+          return true;
+  return false;
+}
+
+/// Returns true if the given instruction performs a taskframe resume, false
+/// otherwise.
+static bool isTaskFrameResume(const Instruction *I,
+                              const Value *TaskFrame = nullptr) {
+  if (const InvokeInst *II = dyn_cast<InvokeInst>(I))
+    if (const Function *Called = II->getCalledFunction())
+      if (Intrinsic::taskframe_resume == Called->getIntrinsicID())
+        if (!TaskFrame || (TaskFrame == II->getArgOperand(0)))
+          return true;
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Spindle implementation
+//
+
+/// Return true if this spindle is a shared EH spindle.
+bool Spindle::isSharedEH() const {
+  return getParentTask()->containsSharedEH(this);
+}
+
+/// Return true if this spindle is the continuation of a detached task.
+bool Spindle::isTaskContinuation() const {
+  for (const Spindle *Pred : predecessors(this))
+    if (predInDifferentTask(Pred))
+      return true;
+  return false;
+}
+
+/// Return true if the successor spindle Succ is part of the same task as this
+/// spindle.
+bool Spindle::succInSameTask(const Spindle *Succ) const { + // If this spindle is a shared EH spindle, the successor must be a shared EH + // spindle tracked by the same task. + if (isSharedEH()) + return (Succ->isSharedEH() && (getParentTask() == Succ->getParentTask())); + + // Otherwise we have an ordinary spindle. If this spindle and Succ are both + // properly contained in ParentTask, return true. + if (getParentTask()->contains(Succ)) + return true; + else { + // Otherwise, check if Succ is a shared EH spindle tracked by the parent of + // ParentTask. + return getParentTask()->isSharedEHExit(Succ); + } +} + +/// Return true if the successor spindle Succ is in a subtask of the task +/// containing this spindle. +bool Spindle::succInSubTask(const Spindle *Succ) const { + return (Succ->getParentTask()->getParentTask() == getParentTask()); +} + +/// Return the taskframe.create intrinsic at the start of the entry block of +/// this Spindle, or nullptr if no such intrinsic exists. +Value *Spindle::getTaskFrameCreate() const { + if (Instruction *TFCreate = ::getTaskFrameCreate(getEntry())) + if (isCanonicalTaskFrameCreate(TFCreate)) + return TFCreate; + return nullptr; +} + +/// Return the task associated with this taskframe, or nullptr of this spindle +/// is not a taskframe. +Task *Spindle::getTaskFromTaskFrame() const { + if (TaskFrameUser) return TaskFrameUser; + if (getParentTask()->getEntrySpindle() == this) return getParentTask(); + return nullptr; +} + +BasicBlock *Spindle::getTaskFrameContinuation() const { + // If this taskframe is used by a task, return that task's continuation. + if (TaskFrameUser) + return TaskFrameUser->getContinuationSpindle()->getEntry(); + + Value *TFCreate = getTaskFrameCreate(); + if (!TFCreate) + return nullptr; + // Scan the uses of the taskframe.create for a canonical taskframe.end. + for (User *U : TFCreate->users()) + if (Instruction *I = dyn_cast(U)) { + if (isTapirIntrinsic(Intrinsic::taskframe_end, I) && + isCanonicalTaskFrameEnd(I)) + return I->getParent()->getSingleSuccessor(); + } + return nullptr; +} + +//===----------------------------------------------------------------------===// +// Task implementation +// + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Task::dump() const { print(dbgs()); } + +LLVM_DUMP_METHOD void Task::dumpVerbose() const { + print(dbgs(), /*Depth=*/0, /*Verbose=*/true); +} +#endif + +// Get the shared EH spindles that this task can exit to and append them to +// SpindleVec. +void Task::getSharedEHExits(SmallVectorImpl &SpindleVec) const { + if (isRootTask()) return; + + // NOTE: We assume that all shared-eh exit spindles are contained in ancestors + // of this task, which might not be true if the shared-eh exit spindles + // themselves detach. It's not clear how this case could arise in practice, + // however. + SmallPtrSet Ancestors; + bool tracksSharedEHSpindles = false; + const Task *Parent = this; + do { + Parent = Parent->getParentTask(); + Ancestors.insert(Parent); + tracksSharedEHSpindles |= Parent->tracksSharedEHSpindles(); + } while (!Parent->isRootTask()); + if (!tracksSharedEHSpindles) return; + + // Scan the successors of the spindles in this task to find shared EH exits. 
+ SmallVector WorkList; + SmallPtrSet Visited; + for (Spindle *S : getSpindles()) + for (Spindle *Succ : successors(S)) + if (Succ->isSharedEH() && Ancestors.contains(Succ->getParentTask())) + WorkList.push_back(Succ); + + // Perform a DFS of the shared EH exits to push each one onto SpindleVec and + // continue searching for more shared EH exits. + while (!WorkList.empty()) { + Spindle *EHExit = WorkList.pop_back_val(); + if (!Visited.insert(EHExit).second) continue; + + // Push EHExit onto SpindleVec. + SpindleVec.push_back(EHExit); + + // Scan the successors of EHExit for more shared EH exits. + for (Spindle *Succ : successors(EHExit)) + if (Succ->isSharedEH()) + WorkList.push_back(Succ); + } +} + +/// Returns true if SharedEH is a shared EH exit of this task. +bool Task::isSharedEHExit(const Spindle *SharedEH) const { + if (isRootTask()) return false; + if (!SharedEH->isSharedEH()) return false; + + // NOTE: We assume that all shared-eh exit spindles are contained in ancestors + // of this task, which might not be true if the shared-eh exit spindles + // themselves detach. It's not clear how this case could arise in practice, + // however. + SmallPtrSet Ancestors; + bool tracksSharedEHSpindles = false; + const Task *Parent = this; + do { + Parent = Parent->getParentTask(); + Ancestors.insert(Parent); + tracksSharedEHSpindles |= Parent->tracksSharedEHSpindles(); + } while (!Parent->isRootTask()); + if (!tracksSharedEHSpindles) return false; + + Task *SharedEHParent = SharedEH->getParentTask(); + if (!Ancestors.contains(SharedEHParent)) + return false; + + // Scan the successors of the spindles in this task to find shared EH exits. + SmallVector WorkList; + SmallPtrSet Visited; + for (Spindle *S : getSpindles()) + for (Spindle *Succ : successors(S)) + if (SharedEH == Succ) + return true; + else if (Succ->isSharedEH() && Ancestors.contains(Succ->getParentTask())) + WorkList.push_back(Succ); + + // Perform a DFS of the shared EH exits to push each one onto SpindleVec and + // continue searching for more shared EH exits. + while (!WorkList.empty()) { + Spindle *EHExit = WorkList.pop_back_val(); + if (!Visited.insert(EHExit).second) continue; + + // Check if this exit is the shared EH exit we're looking for. + if (SharedEH == EHExit) + return true; + + // Scan the successors of EHExit for more shared EH exits. + for (Spindle *Succ : successors(EHExit)) + if (Succ->isSharedEH()) + WorkList.push_back(Succ); + } + + return false; +} + +//===----------------------------------------------------------------------===// +// TaskInfo implementation +// + +// Add the unassociated spindles to the task T in order of a DFS CFG traversal +// starting at the entry block of T. +static void +AssociateWithTask(TaskInfo *TI, Task *T, + SmallPtrSetImpl &UnassocSpindles) { + SmallVector WorkList; + SmallPtrSet Visited; + // Add the successor spindles of the entry block of T to the worklist. + Spindle *Entry = T->getEntrySpindle(); + for (BasicBlock *Exit : Entry->spindle_exits()) + for (BasicBlock *Child : successors(Exit)) + if (Spindle *S = TI->getSpindleFor(Child)) + if (UnassocSpindles.count(S)) + WorkList.push_back(S); + + // Perform a DFS CFG traversal of the spindles associated with task T, and add + // each spindle to T in that order. + while (!WorkList.empty()) { + Spindle *S = WorkList.pop_back_val(); + if (!Visited.insert(S).second) continue; + + // Add the spindle S to T. 
+ LLVM_DEBUG(dbgs() << "Adding spindle@" << S->getEntry()->getName() + << " to task@" << Entry->getEntry()->getName() << "\n"); + TI->addSpindleToTask(S, T); + + // Add the successor spindles of S that are associated with T to the + // worklist. + for (BasicBlock *Exit : S->spindle_exits()) + for (BasicBlock *Child : successors(Exit)) + if (Spindle *S = TI->getSpindleFor(Child)) + if (UnassocSpindles.count(S)) + WorkList.push_back(S); + } + + // We can have remaining unassociated spindles when subtasks share + // exception-handling spindles. + for (Spindle *S : UnassocSpindles) + if (!Visited.count(S)) { + TI->addEHSpindleToTask(S, T); + ++NumSharedEHSpindles; + } + + assert(T->getNumSpindles() + T->getNumSharedEHSpindles() == + UnassocSpindles.size() + 1 && + "Not all unassociated spindles were associated with task."); +} + +// Add the unassociated blocks to the spindle S in order of a DFS CFG traversal +// starting at the entry block of S. +static void +AssociateWithSpindle(TaskInfo *TI, Spindle *S, + SmallPtrSetImpl &UnassocBlocks) { + SmallVector WorkList; + SmallPtrSet Visited; + // Add the successor blocks of the entry of S to the worklist. + for (BasicBlock *Child : successors(S->getEntry())) + if (UnassocBlocks.count(Child)) + WorkList.push_back(Child); + + // Perform a DFS CFG traversal of the blocks associated with spindle S, and + // add each block to S in that order. + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) continue; + + // Add the block BB to S. + TI->addBlockToSpindle(*BB, S); + + // Add the successors of block BB that are associated with S to the + // worklist. + for (BasicBlock *Child : successors(BB)) + if (UnassocBlocks.count(Child)) + WorkList.push_back(Child); + } + + assert(S->getNumBlocks() == UnassocBlocks.size() + 1 && + "Not all unassociated blocks were associated with spindle."); +} + +// Helper function to add spindle edges to spindles. +static void computeSpindleEdges(TaskInfo *TI) { + // Walk all spindles in the CFG to find all spindle edges. + SmallVector WorkList; + SmallPtrSet Visited; + + WorkList.push_back(TI->getRootTask()->getEntrySpindle()); + while (!WorkList.empty()) { + Spindle *S = WorkList.pop_back_val(); + + if (!Visited.insert(S).second) continue; + + // Examine all outgoing CFG edges from this spindle and create a spindle + // edge for each one. Filter out self-edges. + for (BasicBlock *Exit : S->spindle_exits()) { + for (BasicBlock *SB : successors(Exit)) { + Spindle *Succ = TI->getSpindleFor(SB); + if (Succ != S) { + S->addSpindleEdgeTo(Succ, Exit); + // Add this successor spindle for processing. + WorkList.push_back(Succ); + } + } + } + } +} + +// Search the PHI nodes in BB for a user of Val. Return Val if no PHI node in +// BB uses Val. +static Value *FindUserAmongPHIs(Value *Val, BasicBlock *BB) { + for (PHINode &PN : BB->phis()) { + if (Val->getType() != PN.getType()) + continue; + for (Value *Incoming : PN.incoming_values()) + if (Incoming == Val) + return &PN; + } + return Val; +} + +// Helper function to record the normal and exceptional continuation spindles +// for each task. +static void recordContinuationSpindles(TaskInfo *TI) { + for (Task *T : post_order(TI->getRootTask())) { + if (T->isRootTask()) + continue; + + DetachInst *DI = T->getDetach(); + Spindle *S = TI->getSpindleFor(DI->getParent()); + + // Set the continuation spindle for the spawned task. 
+ T->setContinuationSpindle(TI->getSpindleFor(DI->getContinue())); + + // If the detach has an unwind destination, set the exceptional continuation + // spindle for the spawned task. + if (DI->hasUnwindDest()) { + BasicBlock *Unwind = DI->getUnwindDest(); + // We also follow the use-def chain for the landingpad of the + // detach-unwind to determine the value of the landingpad in the + // exceptional continuation. + Value *LPadVal = Unwind->getLandingPadInst(); + // There should be no substantive code between the detach unwind and the + // exceptional continuation. Instead, we expect a sequence of basic + // blocks in the parent spindle S that merges control flow from different + // exception-handling code together. Each basic block in this sequence + // should have a unique successor, and the landingpad of the unwind + // destination should propagate to the exceptional continuation through + // PHI nodes in these blocks. + while (TI->getSpindleFor(Unwind) == S) { + assert(Unwind->getUniqueSuccessor() && + "Unwind destination of detach has many successors, but belongs to " + "the same spindle as the detach."); + Unwind = Unwind->getUniqueSuccessor(); + LPadVal = FindUserAmongPHIs(LPadVal, Unwind); + } + // Set the exceptional continuation spindle for this task. + Spindle *UnwindSpindle = TI->getSpindleFor(Unwind); + LLVM_DEBUG({ + // Check that Task T is indeed a predecessor of this unwind spindle. + bool TaskIsPredecessor = false; + for (Spindle *Pred : predecessors(UnwindSpindle)) { + if (TI->getTaskFor(Pred) == T) { + TaskIsPredecessor = true; + break; + } + } + if (!TaskIsPredecessor) + // Report that an unusual exceptional continuation was found. This + // can happen, for example, due to splitting of landing pads or when + // part of the CFG becomes disconnected due to function inlining. + dbgs() << "TaskInfo: Found exceptional continuation at " + << Unwind->getName() << " with no predecessors in task\n"; + }); + T->setEHContinuationSpindle(UnwindSpindle, LPadVal); + } + } +} + +static bool shouldCreateSpindleAtDetachUnwind(const BasicBlock *MaybeUnwind, + const TaskInfo &TI, + const DominatorTree &DT) { + // Check that MaybeUnwind is a detach-unwind block. + if (!MaybeUnwind->isLandingPad()) + return false; + const BasicBlock *Pred = MaybeUnwind->getSinglePredecessor(); + if (!Pred) { + unsigned NumReachablePredecessors = 0; + for (const BasicBlock *P : predecessors(MaybeUnwind)) { + if (DT.isReachableFromEntry(P)) { + ++NumReachablePredecessors; + Pred = P; + } + } + if (NumReachablePredecessors > 1) + return false; + } + if (!isa(Pred->getTerminator())) + return false; + + const BasicBlock *UnwindSpindleEntry = MaybeUnwind; + // First suppose that a more appropriate detach-unwind spindle entry exists + // later on the chain of unique successors of Unwind. Traverse this chain of + // unique successors of Unwind until we find a spindle entry. + while (!TI.getSpindleFor(UnwindSpindleEntry)) { + if (isa(UnwindSpindleEntry->getTerminator())) + // We found a sync instruction terminating a basic block along the chain + // of unique successors of Unwind. Such a sync instruction should appear + // within a detach-unwind spindle. + return true; + + const BasicBlock *Succ = UnwindSpindleEntry->getUniqueSuccessor(); + if (!Succ) + // We discovered a basic block without a unique successor before we found + // an appropriate detach-unwind spindle entry. Return true, so a new + // detach-unwind spindle entry will be created. 
+      return true;
+    UnwindSpindleEntry = Succ;
+  }
+
+  // Check the type of spindle discovered, to make sure it's appropriate for a
+  // detach-unwind spindle.
+  const Spindle *S = TI.getSpindleFor(UnwindSpindleEntry);
+  return !S->isPhi();
+}
+
+static bool isTaskFrameCreateSpindleEntry(const BasicBlock *B) {
+  if (const Instruction *TFCreate = getTaskFrameCreate(B))
+    if (isCanonicalTaskFrameCreate(TFCreate))
+      return true;
+  return false;
+}
+
+void TaskInfo::analyze(Function &F, DominatorTree &DomTree) {
+  // We first compute defining blocks and IDFs based on the detach and sync
+  // instructions.
+  DenseMap<BasicBlock *, unsigned> BBNumbers;
+  unsigned NextBBNum = 0;
+  int64_t BBCount = 0, SpindleCount = 0, TaskCount = 0;
+  SmallPtrSet<BasicBlock *, 32> DefiningBlocks;
+  // Go through each block to figure out where tasks begin and where sync
+  // instructions occur.
+  for (BasicBlock &B : F) {
+    BBCount++;
+    BBNumbers[&B] = NextBBNum++;
+    if (&F.getEntryBlock() == &B) {
+      DefiningBlocks.insert(&B);
+      // Create a spindle and root task for the entry block.
+      Spindle *S = createSpindleWithEntry(&B, Spindle::SPType::Entry);
+      SpindleCount++;
+      RootTask = createTaskWithEntry(S, DomTree);
+      TaskCount++;
+    }
+    if (DetachInst *DI = dyn_cast<DetachInst>(B.getTerminator())) {
+      BasicBlock *TaskEntry = DI->getDetached();
+      DefiningBlocks.insert(TaskEntry);
+      // Create a new spindle and task.
+      Spindle *S = createSpindleWithEntry(TaskEntry, Spindle::SPType::Detach);
+      SpindleCount++;
+      createTaskWithEntry(S, DomTree);
+      TaskCount++;
+
+      // Create a new Phi spindle for the task continuation. We do this
+      // explicitly to handle cases where the spawned task does not return
+      // (reattach).
+      BasicBlock *TaskContinue = DI->getContinue();
+      DefiningBlocks.insert(TaskContinue);
+      if (!getSpindleFor(TaskContinue)) {
+        createSpindleWithEntry(TaskContinue, Spindle::SPType::Phi);
+        SpindleCount++;
+      }
+
+      // Similarly, create a new Phi spindle for the task unwind.
+      if (DI->hasUnwindDest()) {
+        BasicBlock *TaskUnwind = DI->getUnwindDest();
+        DefiningBlocks.insert(TaskUnwind);
+        if (!getSpindleFor(TaskUnwind)) {
+          createSpindleWithEntry(TaskUnwind, Spindle::SPType::Phi);
+          SpindleCount++;
+        }
+      }
+    } else if (isa<SyncInst>(B.getTerminator())) {
+      BasicBlock *SPEntry = B.getSingleSuccessor();
+      // For sync instructions, we mark the block containing the sync
+      // instruction as the defining block for the sake of calculating IDFs.
+      // If the successor of the sync has multiple predecessors, then we want to
+      // allow a phi node to be created starting at that block.
+      DefiningBlocks.insert(&B);
+      // Create a new spindle. The type of this spindle might change later, if
+      // we discover it requires a phi.
+      if (!getSpindleFor(SPEntry)) {
+        createSpindleWithEntry(SPEntry, Spindle::SPType::Sync);
+        SpindleCount++;
+      }
+      assert((getSpindleFor(SPEntry)->isSync() ||
+              getSpindleFor(SPEntry)->isPhi()) &&
+             "Discovered early a non-sync, non-phi spindle after sync");
+    }
+    // Create new spindles based on taskframe intrinsics. We need only worry
+    // about taskframe.create and taskframe.resume.
+    if (isTaskFrameCreateSpindleEntry(&B)) {
+      // This block starts with a taskframe.create. Mark it as a spindle entry.
+      DefiningBlocks.insert(&B);
+      if (!getSpindleFor(&B)) {
+        // Create a new spindle.
+        createSpindleWithEntry(&B, Spindle::SPType::Phi);
+        SpindleCount++;
+      }
+    }
+    if (endsUnassociatedTaskFrame(&B)) {
+      BasicBlock *SPEntry = B.getSingleSuccessor();
+      // This block ends with a taskframe.end. Mark its successor as a spindle
+      // entry.
+ DefiningBlocks.insert(SPEntry); + if (!getSpindleFor(SPEntry)) { + // Create a new spindle. + createSpindleWithEntry(SPEntry, Spindle::SPType::Phi); + SpindleCount++; + } + } else if (isTaskFrameResume(B.getTerminator())) { + // This block ends with a taskframe.resume invocation. Mark the unwind + // destination as a spindle entry. + InvokeInst *II = cast(B.getTerminator()); + BasicBlock *ResumeDest = II->getUnwindDest(); + DefiningBlocks.insert(ResumeDest); + if (!getSpindleFor(ResumeDest)) { + createSpindleWithEntry(ResumeDest, Spindle::SPType::Phi); + SpindleCount++; + } + } + } + NumBasicBlocks += BBCount; + NumSpindles += SpindleCount; + NumTasks += TaskCount; + bool ParallelFunc = (DefiningBlocks.size() > 1); + if (ParallelFunc) { + NumBasicBlocksInPF += BBCount; + NumSpindlesInPF += SpindleCount; + NumTasksInPF += TaskCount; + } + LLVM_DEBUG({ + dbgs() << "DefiningBlocks:\n"; + for (BasicBlock *BB : DefiningBlocks) + dbgs() << " " << BB->getName() << "\n"; + }); + + // Compute IDFs to determine additional starting points of spindles, e.g., + // continuation points and other spindle PHI-nodes. + ForwardIDFCalculator IDFs(DomTree); + IDFs.setDefiningBlocks(DefiningBlocks); + SmallVector IDFBlocks; + IDFs.calculate(IDFBlocks); + + if (IDFBlocks.size() > 1) + llvm::sort(IDFBlocks, + [&BBNumbers](const BasicBlock *A, const BasicBlock *B) { + return BBNumbers.find(A)->second < BBNumbers.find(B)->second; + }); + + LLVM_DEBUG({ + dbgs() << "IDFBlocks:\n"; + for (BasicBlock *BB : IDFBlocks) + dbgs() << " " << BB->getName() << "\n"; + }); + + // Create spindles for all IDFBlocks. + for (BasicBlock *B : IDFBlocks) + if (Spindle *S = getSpindleFor(B)) { + assert((S->isSync() || S->isPhi()) && + "Phi spindle to be created on existing non-sync spindle"); + // Change the type of this spindle. + S->Ty = Spindle::SPType::Phi; + } else { + // Create a new spindle. + createSpindleWithEntry(B, Spindle::SPType::Phi); + ++NumSpindles; + if (ParallelFunc) + ++NumSpindlesInPF; + } + + // Use the following linear-time algorithm to partition the function's blocks + // into spindles, partition the spindles into tasks, and compute the tree of + // tasks in this function. + // + // -) A post-order traversal of the dominator tree looks for a spindle entry + // and creates a stack of blocks it finds along the way. + // + // -) Once a spindle entry is encountered, the blocks belonging to that + // spindle equal the suffix of the stack of found blocks that are all + // dominated by the spindle's entry. These blocks are removed from the stack + // and added to the spindle according to a DFS CFG traversal starting at the + // spindle's entry. + // + // -) Similarly, the post-order travesal of the dominator tree finds the set + // of spindles that make up each task. These spindles are collected and added + // to their enclosing task using the same algorithm as above. + // + // -) Finally, the post-order traversal of the dominator tree deduces the + // hierarchical nesting of tasks within the function. Subtasks are associated + // with their parent task whenever a task entry that dominates the previous + // task entry is encountered. + std::vector FoundBlocks; + SmallVector FoundSpindles; + SmallVector FoundTFCreates; + SmallVector UnassocTasks; + for (auto DomNode : post_order(DomTree.getRootNode())) { + BasicBlock *BB = DomNode->getBlock(); + // If a basic block is not a spindle entry, mark it found and continue. + if (!getSpindleFor(BB)) { + // Perform some rare, special-case handling of detach unwind blocks. 
+ if (shouldCreateSpindleAtDetachUnwind(BB, *this, DomTree)) { + createSpindleWithEntry(BB, Spindle::SPType::Phi); + ++NumSpindles; + } else { + FoundBlocks.push_back(BB); + continue; + } + } + // This block is a spindle entry. + Spindle *S = getSpindleFor(BB); + + // Associated blocks dominated by spindle S with spindle S. + { + SmallPtrSet UnassocBlocks; + // Determine which found blocks are associated with this spindle. Because + // of the post-order tree traversal, these blocks form a suffix of + // FoundBlocks. + while (!FoundBlocks.empty()) { + BasicBlock *FB = FoundBlocks.back(); + if (DomTree.dominates(S->getEntry(), FB)) { + UnassocBlocks.insert(FB); + FoundBlocks.pop_back(); + } else + break; + } + + // Associate the unassociated blocks with spindle S. + if (!UnassocBlocks.empty()) + AssociateWithSpindle(this, S, UnassocBlocks); + } + + // Mark taskframe.create spindles found. + if (Value *TaskFrame = S->getTaskFrameCreate()) { + FoundTFCreates.push_back(S); + for (Task *SubT : reverse(UnassocTasks)) { + if (!DomTree.dominates(S->getEntry(), SubT->getEntry())) + break; + // If SubT uses the TaskFrame created in S, associate the two. + if (SubT->getTaskFrameUsed() == TaskFrame) { + AssociateTaskFrameWithUser(SubT, S); + break; + } + } + } + + // If this spindle is not an entry to a task, mark it found and continue. + if (!getTaskFor(S)) { + FoundSpindles.push_back(S); + continue; + } + // This spindle is a task entry. + Task *T = getTaskFor(S); + + // Associate spindles dominated by task T with task T. + { + SmallPtrSet UnassocSpindles; + // Determine which found spindles are associated with this task. Because + // of the post-order tree traversal, these spindles form a suffix of + // FoundSpindles. + while (!FoundSpindles.empty()) { + Spindle *FS = FoundSpindles.back(); + if (DomTree.dominates(T->getEntry(), FS->getEntry())) { + UnassocSpindles.insert(FS); + FoundSpindles.pop_back(); + } else + break; + } + // Associate the unassociated spindles with task T. + if (!UnassocSpindles.empty()) + AssociateWithTask(this, T, UnassocSpindles); + } + + // If the last task is dominated by this task, add the unassociated tasks as + // children of this task. + while (!UnassocTasks.empty()) { + Task *LastTask = UnassocTasks.back(); + if (!DomTree.dominates(T->getEntry(), LastTask->getEntry())) + break; + T->addSubTask(LastTask); + UnassocTasks.pop_back(); + } + UnassocTasks.push_back(T); + + // Add taskframe.create spindles as children of this task. + while (!FoundTFCreates.empty()) { + Spindle *TF = FoundTFCreates.back(); + if (!DomTree.dominates(T->getEntry(), TF->getEntry())) + break; + T->TaskFrameCreates.push_back(TF); + FoundTFCreates.pop_back(); + } + } + + // Populate the predecessors and successors of all spindles. + computeSpindleEdges(this); + + // Record continuation spindles for each task. + recordContinuationSpindles(this); + + if (PrintTaskFrameTree) + // Determine the subtasks of taskframes discovered. + findTaskFrameTree(); +} + +/// Recursive helper to traverse the spindles to discover the taskframe tree. 
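+///
+/// For intuition (an illustrative sketch, not code from this file): a
+/// taskframe, delimited by taskframe.create/taskframe.end intrinsics, holds
+/// the frame state set up for a spawned task before it is detached.
+/// Taskframes nest, e.g.,
+///
+///   tf0 (taskframe.create)
+///     +-- task T1 (uses tf0)
+///     +-- tf1 (taskframe.create)
+///           +-- task T2 (uses tf1)
+///
+/// For each taskframe spindle, this helper records the spindles it contains,
+/// the subtasks spawned within it, and its child taskframes.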
+void TaskInfo::findTaskFrameTreeHelper( + Spindle *TFSpindle, SmallVectorImpl &ParentWorkList, + SmallPtrSetImpl &SubTFVisited) { + const Value *TFCreate = TFSpindle->getTaskFrameCreate(); + const Task *UserT = TFSpindle->getTaskFromTaskFrame(); + const Spindle *Continuation = nullptr; + const Spindle *EHContinuation = nullptr; + if (UserT) { + Continuation = UserT->getContinuationSpindle(); + EHContinuation = UserT->getEHContinuationSpindle(); + } else { + // This taskframe is not associated with a task. Examine the uses of the + // taskframe to determine its continuation and exceptional-continuation + // spindles. + for (const User *U : TFCreate->users()) { + if (const Instruction *I = dyn_cast(U)) { + if (isTapirIntrinsic(Intrinsic::taskframe_end, I) && + isCanonicalTaskFrameEnd(I)) + Continuation = getSpindleFor(I->getParent()->getSingleSuccessor()); + else if (isTaskFrameResume(I)) { + const InvokeInst *II = dyn_cast(I); + EHContinuation = getSpindleFor(II->getUnwindDest()); + } + } + } + } + + SmallVector WorkList; + SmallPtrSet Visited; + WorkList.push_back(TFSpindle); + while (!WorkList.empty()) { + Spindle *S = WorkList.pop_back_val(); + if (!Visited.insert(S).second) + continue; + + // Add S to the set of taskframe spindles. + TFSpindle->TaskFrameSpindles.insert(S); + + for (Spindle::SpindleEdge &SuccEdge : S->out_edges()) { + // If the successor spindle is itself a TaskFrameCreate spindle, add the + // subtask that uses it, and continue. + if (SuccEdge.first->getTaskFrameCreate()) { + Spindle *SubTF = SuccEdge.first; + if (!SubTFVisited.insert(SubTF).second) + continue; + + if (Task *SubTFUser = SubTF->getTaskFrameUser()) + // Add SubTFUser as a subtask of the taskframe spindle. + TFSpindle->TaskFrameSubtasks.insert(SubTFUser); + + // Add SubTF as a subtaskframe of the taskframe spindle. + TFSpindle->SubTaskFrames.insert(SubTF); + SubTF->TaskFrameParent = TFSpindle; + + // Recur into the new taskframe. + findTaskFrameTreeHelper(SubTF, WorkList, SubTFVisited); + continue; + } + + // Handle any spindles not in the same task as TFSpindle. + if (!TFSpindle->succInSameTask(SuccEdge.first)) + if (isa(SuccEdge.second->getTerminator())) { + Task *SubT = getTaskFor(SuccEdge.first); + if (SubT != UserT) { + // Add SubT as a subtask of the taskframe spindle. + TFSpindle->TaskFrameSubtasks.insert(SubT); + + // Add a spindle representing the subtask. + if (!SubT->getTaskFrameCreateSpindle()) { + Spindle *SubTF = SuccEdge.first; + // Add the subtask's entry spindle to the set of subtaskframes. + TFSpindle->SubTaskFrames.insert(SubTF); + SubTF->TaskFrameParent = TFSpindle; + + // Recur into the new taskframe. + findTaskFrameTreeHelper(SubTF, WorkList, SubTFVisited); + continue; + } else { + LLVM_DEBUG({ + if (!TFSpindle->SubTaskFrames.count(SuccEdge.first)) + dbgs() << "Search encountered subtask@" + << SubT->getEntry()->getName() << " with taskframe " + << "before that subtask's taskframe.create."; + }); + } + } + } + + // Add the normal continuation to parent worklist. + if (SuccEdge.first == Continuation) { + ParentWorkList.push_back(SuccEdge.first); + continue; + } + // Add the exception-handling continuation to the appropriate worklist. + if (SuccEdge.first == EHContinuation) { + // If TFSpindle corresponds to a taskframe.create associated with a + // task, push the successor onto our worklist. Otherwise push it onto + // the parent's worklist. + // + // TODO: Why do we ever push the EHContinuation onto our own worklist? 
+ if (TFCreate && UserT) + WorkList.push_back(SuccEdge.first); + else + ParentWorkList.push_back(SuccEdge.first); + continue; + } + + Instruction *ExitTerm = SuccEdge.second->getTerminator(); + // Add landingpad successor of taskframe.resume to parent worklist. + if (isTaskFrameResume(ExitTerm, TFCreate)) { + if (SuccEdge.first->getEntry() == + cast(ExitTerm)->getUnwindDest()) + ParentWorkList.push_back(SuccEdge.first); + continue; + } + // Add landingpad successor of detached.rethrow to the appropriate worklist. + if (isDetachedRethrow(ExitTerm)) { + if (SuccEdge.first->getEntry() == + cast(ExitTerm)->getUnwindDest()) { + // If TFSpindle corresponds to a taskframe.create, push the successor + // onto our worklist. Otherwise push it onto the parent's worklist. + if (TFCreate) + WorkList.push_back(SuccEdge.first); + else + ParentWorkList.push_back(SuccEdge.first); + } + continue; + } + + WorkList.push_back(SuccEdge.first); + } + } +} + +/// Compute the spindles and subtasks contained in all taskframes. +void TaskInfo::findTaskFrameTree() { + // If we've already found the taskframe tree, don't recompute it. + if (ComputedTaskFrameTree) + return; + + SmallPtrSet SubTFVisited; + // Get the taskframe tree under each taskframe.create in the root task. + for (Spindle *TFSpindle : getRootTask()->taskframe_creates()) { + SmallVector WorkList; + if (!SubTFVisited.insert(TFSpindle).second) + continue; + findTaskFrameTreeHelper(TFSpindle, WorkList, SubTFVisited); + } + + // Get the taskframe tree under each subtask that does not have an associated + // taskframe.create. + for (Task *SubT : getRootTask()->subtasks()) { + // If this subtask uses a taskframe, then we should have discovered its + // taskframe tree already. + if (SubT->getTaskFrameUsed()) + continue; + SmallVector WorkList; + // Treat the entry spindle of the subtask as the taskframe spindle. + Spindle *TFSpindle = SubT->getEntrySpindle(); + if (!SubTFVisited.insert(TFSpindle).second) + continue; + findTaskFrameTreeHelper(TFSpindle, WorkList, SubTFVisited); + } + + // Discover taskframe roots for all tasks in the function. + for (Task *T : post_order(getRootTask())) { + // Find taskframe.creates in T that do not have parents in T, and add them + // as taskframe roots of T. + for (Spindle *TFSpindle : T->taskframe_creates()) { + if (Spindle *Parent = TFSpindle->getTaskFrameParent()) { + if (!T->contains(Parent)) + T->TaskFrameRoots.push_back(TFSpindle); + } else { + T->TaskFrameRoots.push_back(TFSpindle); + } + } + + // For any subtask of T that does not have a taskframe, add its entry + // spindle as a taskframe root. + for (Task *SubT : T->subtasks()) { + // If SubT does not have an associated taskframe, then we might need to + // mark it as a taskframe root. + if (!SubT->getTaskFrameUsed()) { + Spindle *EffectiveTF = SubT->getEntrySpindle(); + if (Spindle *Parent = EffectiveTF->getTaskFrameParent()) { + if (!T->contains(Parent)) + T->TaskFrameRoots.push_back(EffectiveTF); + } else { + T->TaskFrameRoots.push_back(EffectiveTF); + } + } + } + } + + // Record that the taskframe tree has been computed. + ComputedTaskFrameTree = true; +} + +/// Determine which blocks the value is live in. +/// +/// These are blocks which lead to uses. Knowing this allows us to avoid +/// inserting PHI nodes into blocks which don't lead to uses (thus, the inserted +/// phi nodes would be dead). 
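+///
+/// As a rough example (illustrative only), the order of references within a
+/// block matters:
+///
+///   store i32 0, ptr %a    ; def before any use: %a is NOT live-in here
+///   %v = load i32, ptr %a
+///
+/// whereas a block that loads %a before any store to it is live-in, and so,
+/// transitively, are its predecessors, up to (but not including) blocks that
+/// define %a.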
+static void ComputeLiveInBlocks( + const AllocaInst *AI, + const SmallVectorImpl &UsingBlocks, + const SmallPtrSetImpl &DefBlocks, + SmallPtrSetImpl &LiveInBlocks) { + // To determine liveness, we must iterate through the predecessors of blocks + // where the def is live. Blocks are added to the worklist if we need to + // check their predecessors. Start with all the using blocks. + SmallVector LiveInBlockWorklist(UsingBlocks.begin(), + UsingBlocks.end()); + + // If any of the using blocks is also a definition block, check to see if the + // definition occurs before or after the use. If it happens before the use, + // the value isn't really live-in. + for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) { + BasicBlock *BB = LiveInBlockWorklist[i]; + if (!DefBlocks.count(BB)) + continue; + + // Okay, this is a block that both uses and defines the value. If the first + // reference to the alloca is a def (store), then we know it isn't live-in. + for (BasicBlock::iterator I = BB->begin();; ++I) { + if (StoreInst *SI = dyn_cast(I)) { + if (SI->getOperand(1) != AI) + continue; + + // We found a store to the alloca before a load. The alloca is not + // actually live-in here. + LiveInBlockWorklist[i] = LiveInBlockWorklist.back(); + LiveInBlockWorklist.pop_back(); + --i; + --e; + break; + } + + if (LoadInst *LI = dyn_cast(I)) { + if (LI->getOperand(0) != AI) + continue; + + // Okay, we found a load before a store to the alloca. It is actually + // live into this block. + break; + } + } + } + + // Now that we have a set of blocks where the phi is live-in, recursively add + // their predecessors until we find the full region the value is live. + while (!LiveInBlockWorklist.empty()) { + BasicBlock *BB = LiveInBlockWorklist.pop_back_val(); + + // The block really is live in here, insert it into the set. If already in + // the set, then it has already been processed. + if (!LiveInBlocks.insert(BB).second) + continue; + + // Since the value is live into BB, it is either defined in a predecessor or + // live into it to. Add the preds to the worklist unless they are a + // defining block. + for (BasicBlock *P : predecessors(BB)) { + // The value is not live into a predecessor if it defines the value. + if (DefBlocks.count(P)) + continue; + + // Otherwise it is, add to the worklist. + LiveInBlockWorklist.push_back(P); + } + } +} + +// Check the set PHIBlocks if a PHI needs to be inserted in a task-continue +// block. +static bool needPhiInTaskContinue( + const TaskInfo &TI, const AllocaInst *AI, + SmallVectorImpl &PHIBlocks) { + // Determine which PHI nodes want to use a value from a detached predecessor. + // Because register state is not preserved across a reattach, these alloca's + // cannot be promoted. + for (unsigned i = 0, e = PHIBlocks.size(); i != e; ++i) { + const BasicBlock *BB = PHIBlocks[i]; + for (const_pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E; ++PI) { + const BasicBlock *P = *PI; + if (TI.getSpindleFor(BB) && TI.getSpindleFor(P) && + TI.getSpindleFor(BB)->predInDifferentTask(TI.getSpindleFor(P))) { + // TODO: Check if there's a store to this alloca in the task enclosing + // P. + LLVM_DEBUG(dbgs() << "Alloca " << *AI << " has use reattached from " << + P->getName() << "\n"); + return true; + } + } + } + return false; +} + +/// Check if a alloca AI is promotable based on uses in subtasks. 
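+///
+/// Roughly (an illustrative sketch, not code from this project): an alloca
+/// that is written inside a spawned task and read after the corresponding
+/// merge point, e.g.,
+///
+///   x = 0;              // alloca for x
+///   cilk_spawn x = f(); // store to x inside the detached task
+///   cilk_sync;
+///   use(x);             // would need a PHI fed from the detached task
+///
+/// cannot be promoted to a register, because register state is not preserved
+/// across a reattach. An alloca whose loads and stores all stay within a
+/// single spindle never needs such a PHI and remains promotable.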
+bool TaskInfo::isAllocaParallelPromotable(const AllocaInst *AIP) const { + if (getTaskFor(AIP->getParent())->isSerial()) return true; + + DominatorTree &DomTree = getRootTask()->DomTree; + AllocaInst *AI = const_cast(AIP); + SmallPtrSet DefBlocks; + SmallVector UsingBlocks; + const Spindle *OnlySpindle = getSpindleFor(AIP->getParent()); + bool OnlyUsedInOneSpindle = true; + + // As we scan the uses of the alloca instruction, keep track of stores, and + // decide whether all of the loads and stores to the alloca are within the + // same basic block. + for (auto UI = AI->user_begin(), E = AI->user_end(); UI != E;) { + Instruction *User = cast(*UI++); + if (StoreInst *SI = dyn_cast(User)) { + // Remember the basic blocks which define new values for the alloca + DefBlocks.insert(SI->getParent()); + } else if (LoadInst *LI = dyn_cast(User)) { + // Otherwise it must be a load instruction, keep track of variable reads. + UsingBlocks.push_back(LI->getParent()); + } else continue; + + if (OnlyUsedInOneSpindle) + if (getSpindleFor(User->getParent()) != OnlySpindle) + OnlyUsedInOneSpindle = false; + } + + // A spindle is guaranteed to execute as a serial unit. Hence, if an alloca + // is only used in a single spindle, it is safe to promote. + if (OnlyUsedInOneSpindle) return true; + + ForwardIDFCalculator IDF(DomTree); + // Determine which blocks the value is live in. These are blocks which lead + // to uses. + SmallPtrSet LiveInBlocks; + ComputeLiveInBlocks(AI, UsingBlocks, DefBlocks, LiveInBlocks); + // Filter out live-in blocks that are not dominated by the alloca. + if (AI->getParent() != DomTree.getRoot()) { + SmallVector LiveInToRemove; + for (BasicBlock *LiveIn : LiveInBlocks) + if (!DomTree.dominates(AI->getParent(), LiveIn)) + LiveInToRemove.push_back(LiveIn); + for (BasicBlock *ToRemove : LiveInToRemove) + LiveInBlocks.erase(ToRemove); + } + + // Determine which blocks need PHI nodes and see if we can optimize out some + // work by avoiding insertion of dead phi nodes. + IDF.setLiveInBlocks(LiveInBlocks); + IDF.setDefiningBlocks(DefBlocks); + SmallVector PHIBlocks; + IDF.calculate(PHIBlocks); + + return !needPhiInTaskContinue(*this, AI, PHIBlocks); +} + +// This method is called once per spindle during an initial DFS traversal of the +// spindle graph. +bool IsSyncedState::markDefiningSpindle(const Spindle *S) { + LLVM_DEBUG(dbgs() << "markDefiningSpindle @ " << *S << "\n"); + // Entry spindles, detach spindles, sync spindles, and continuation-Phi + // spindles all define their sync state directly. Other Phi spindles + // determine their sync state based on their predecessors. + switch (S->getType()) { + case Spindle::SPType::Entry: + case Spindle::SPType::Detach: + SyncedState[S] = SyncInfo::TaskEntry; + return true; + case Spindle::SPType::Sync: + SyncedState[S] = SyncInfo::Synced; + return true; + case Spindle::SPType::Phi: + if (S->isTaskContinuation()) { + SyncedState[S] = SyncInfo::Unsynced; + return true; + } + } + return false; +} + +// This method is called once per unevaluated spindle in an inverse-post-order +// walk of the spindle graph. +bool IsSyncedState::evaluate(const Spindle *S, unsigned EvalNum) { + LLVM_DEBUG(dbgs() << "evaluate @ " << *S << "\n"); + + // For the first evaluation, optimistically assume that we are synced. Any + // unsynced predecessor will clear this bit. 
+ if (!EvalNum && !SyncedState.count(S)) { + SyncedState[S] = SyncInfo::Synced; + } + + for (const Spindle::SpindleEdge &PredEdge : S->in_edges()) { + const Spindle *Pred = PredEdge.first; + const BasicBlock *Inc = PredEdge.second; + + // During the first evaluation, if we have a loop amongst Phi spindles, then + // the predecessor might not be defined. Skip predecessors that aren't + // defined. + if (!EvalNum && !SyncedState.count(Pred)) { + SyncedState[S] = setIncomplete(SyncedState[S]); + continue; + } else + assert(SyncedState.count(Pred) && + "All predecessors should have synced states after first eval."); + + // If we find an unsynced predecessor that is not terminated by a sync + // instruction, then we must be unsynced. + if (isUnsynced(SyncedState[Pred]) && + !isa(Inc->getTerminator())) { + SyncedState[S] = setUnsynced(SyncedState[S]); + break; + } + } + // Because spindles are evaluated in each round in an inverse post-order + // traversal, two evaluations should suffice. If we have an incomplete synced + // state at the end of the first evaluation, then we conclude that it's synced + // at set it complete. + if (EvalNum && isIncomplete(SyncedState[S])) { + SyncedState[S] = setComplete(SyncedState[S]); + return true; + } + return !isIncomplete(SyncedState[S]); +} + +// This method is called once per spindle during an initial DFS traversal of +// the spindle graph. +bool MaybeParallelTasks::markDefiningSpindle(const Spindle *S) { + LLVM_DEBUG(dbgs() << "MaybeParallelTasks::markDefiningSpindle @ " + << S->getEntry()->getName() << "\n"); + switch (S->getType()) { + // Emplace empty task lists for Entry, Detach, and Sync spindles. + case Spindle::SPType::Entry: + case Spindle::SPType::Detach: + TaskList.try_emplace(S); + return true; + case Spindle::SPType::Sync: + return false; + case Spindle::SPType::Phi: { + // At task-continuation Phi's, initialize the task list with the detached + // task that reattaches to this continuation. + if (S->isTaskContinuation()) { + LLVM_DEBUG(dbgs() << " TaskCont spindle " << S->getEntry()->getName() + << "\n"); + for (const Spindle *Pred : predecessors(S)) { + LLVM_DEBUG(dbgs() << " pred spindle " + << Pred->getEntry()->getName() << "\n"); + if (S->predInDifferentTask(Pred)) + TaskList[S].insert(Pred->getParentTask()); + } + LLVM_DEBUG({ + for (const Task *MPT : TaskList[S]) + dbgs() << " Added MPT " << MPT->getEntry()->getName() << "\n"; + }); + return true; + } + return false; + } + } + return false; +} + +// This method is called once per unevaluated spindle in an inverse-post-order +// walk of the spindle graph. +bool MaybeParallelTasks::evaluate(const Spindle *S, unsigned EvalNum) { + LLVM_DEBUG(dbgs() << "MaybeParallelTasks::evaluate @ " + << S->getEntry()->getName() << "\n"); + if (!TaskList.count(S)) + TaskList.try_emplace(S); + + bool NoChange = true; + for (const Spindle::SpindleEdge &PredEdge : S->in_edges()) { + const Spindle *Pred = PredEdge.first; + const BasicBlock *Inc = PredEdge.second; + + // If the incoming edge is a sync edge, get the associated sync region. + const Value *SyncRegSynced = nullptr; + if (const SyncInst *SI = dyn_cast(Inc->getTerminator())) + SyncRegSynced = SI->getSyncRegion(); + + // Iterate through the tasks in the task list for Pred. + for (const Task *MP : TaskList[Pred]) { + // Filter out any tasks that are synced by the sync region. + if (const DetachInst *DI = MP->getDetach()) + if (SyncRegSynced == DI->getSyncRegion()) + continue; + // Insert the task into this spindle's task list. 
If this task is a new + // addition, then we haven't yet reached the fixed point of this analysis. + if (TaskList[S].insert(MP).second) + NoChange = false; + } + } + LLVM_DEBUG({ + dbgs() << " New MPT list for " << S->getEntry()->getName() + << " (NoChange? " << NoChange << ")\n"; + for (const Task *MP : TaskList[S]) + dbgs() << " " << MP->getEntry()->getName() << "\n"; + }); + return NoChange; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const Spindle &S) { + S.print(OS); + return OS; +} + +bool TaskInfo::invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &) { + // Check whether the analysis, all analyses on functions, or the function's + // CFG have been preserved. + auto PAC = PA.getChecker(); + return !(PAC.preserved() || PAC.preservedSet>() || + PAC.preservedSet()); +} + +static const BasicBlock *getSingleNotUnreachableSuccessor( + const BasicBlock *BB) { + const BasicBlock *SingleSuccessor = nullptr; + for (const auto *Succ : children(BB)) { + if (isa(Succ->getFirstNonPHIOrDbgOrLifetime())) + continue; + if (!SingleSuccessor) + SingleSuccessor = Succ; + else + return nullptr; + } + return SingleSuccessor; +} + +/// Print spindle with all the BBs inside it. +void Spindle::print(raw_ostream &OS, bool Verbose) const { + if (getParentTask()->getEntrySpindle() == this) + OS << ""; + BasicBlock *Entry = getEntry(); + for (unsigned i = 0; i < getBlocks().size(); ++i) { + BasicBlock *BB = getBlocks()[i]; + if (BB == Entry) { + if (getTaskFrameCreate()) + OS << ""; + switch (Ty) { + case SPType::Entry: OS << ""; break; + case SPType::Detach: OS << ""; break; + case SPType::Sync: OS << ""; break; + case SPType::Phi: OS << ""; break; + } + } + if (!Verbose) { + if (i) OS << ","; + BB->printAsOperand(OS, false); + } else + OS << "\n"; + + if (isSpindleExiting(BB)) { + OS << ""; + if (isTaskFrameResume(BB->getTerminator())) + OS << ""; + else if (getParentTask()->isTaskExiting(BB)) { + if (isa(BB->getTerminator()) || + isa(BB->getTerminator())) + OS << ""; + else if (isa(BB->getTerminator()) || + isa(BB->getTerminator())) + OS << ""; + else if (getParentTask()->getEHContinuationSpindle() && + (getSingleNotUnreachableSuccessor(BB) == + getParentTask()->getEHContinuationSpindle()->getEntry())) + OS << ""; + else + OS << ""; + } + } + if (Verbose) + BB->print(OS); + } +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const Task &T) { + T.print(OS); + return OS; +} + +/// Print task with all the BBs inside it. +void Task::print(raw_ostream &OS, unsigned Depth, bool Verbose) const { + OS.indent(Depth * 2) << "task at depth " << Depth << ": "; + + // Print the spindles in this task. + for (const Spindle *S : + depth_first>(getEntrySpindle())) { + OS << "{"; + S->print(OS, Verbose); + OS << "}"; + } + OS << "\n"; + + // If this task contains tracks any shared EH spindles for its subtasks, print + // those shared EH spindles. + for (const Spindle *S : shared_eh_spindles()) { + OS << "{"; + S->print(OS, Verbose); + OS << "}\n"; + } + + // Print the subtasks of this task. 
+ for (const Task *SubTask : getSubTasks()) + SubTask->print(OS, Depth+1, Verbose); +} + +static void printTaskFrame(raw_ostream &OS, const Spindle *TFEntry, + unsigned Depth, bool Verbose) { + OS.indent(Depth * 2) << "taskframe at depth " << Depth << ": "; + + OS << "spindle@" << TFEntry->getEntry()->getName(); + if (const Task *User = TFEntry->getTaskFromTaskFrame()) + OS << " (used by task@" << User->getEntry()->getName() << ")"; + OS << "\n"; + + for (const Spindle *SubTF : TFEntry->subtaskframes()) + printTaskFrame(OS, SubTF, Depth+1, Verbose); +} + +// Debugging +void TaskInfo::print(raw_ostream &OS) const { + OS << "Spindles:\n"; + SmallVector WorkList; + SmallPtrSet Visited; + WorkList.push_back(getRootTask()->getEntrySpindle()); + while (!WorkList.empty()) { + const Spindle *S = WorkList.pop_back_val(); + if (!Visited.insert(S).second) continue; + + OS << "{"; + S->print(OS); + OS << "}"; + + for (const Spindle *Succ : successors(S)) + WorkList.push_back(Succ); + } + OS << "\n\n"; + + OS << "Task tree:\n"; + getRootTask()->print(OS); + OS << "\n"; + + for (const Task *T : post_order(getRootTask())) { + if (T->taskframe_creates().begin() == T->taskframe_creates().end()) + continue; + OS << "task@" << T->getEntry()->getName() << " has taskframe.creates:\n"; + for (const Spindle *S : T->taskframe_creates()) { + OS << " spindle@" << S->getEntry()->getName() << "\n"; + // Print the task that uses this taskframe.create + if (S->getTaskFrameUser()) + OS << " used by task@" + << S->getTaskFrameUser()->getEntry()->getName() << "\n"; + else + OS << " not used.\n"; + + // Print the subtaskframess under this taskframe.create. + for (const Spindle *SubTF : S->subtaskframes()) + OS << " contains subtaskframe@" + << SubTF->getEntry()->getName() << "\n"; + + // Print the subtasks under this taskframe.create. + for (const Task *SubT : S->taskframe_subtasks()) + OS << " contains subtask@" + << SubT->getEntry()->getName() << "\n"; + + // Print the taskframe spindles themselves. + for (const Spindle *TFSpindle : S->taskframe_spindles()) + OS << " " << *TFSpindle << "\n"; + } + OS << "\n"; + } + + if (PrintTaskFrameTree) { + for (const Spindle *TFCreate : getRootTask()->taskframe_roots()) { + printTaskFrame(OS, TFCreate, 0, false); + OS << "\n"; + } + } + + if (PrintMayHappenInParallel) { + // Evaluate the tasks that might be in parallel with each spindle, and + // determine number of discriminating syncs: syncs that sync a subset of the + // detached tasks, based on sync regions. + MaybeParallelTasks MPTasks; + evaluateParallelState(MPTasks); + for (const Task *T : depth_first(getRootTask())) { + // Skip tasks with no subtasks. + if (T->isSerial()) continue; + + for (const Spindle *S : T->spindles()) { + // Only conider spindles that might have tasks in parallel. + if (MPTasks.TaskList[S].empty()) continue; + + OS << "spindle@" << S->getEntry()->getName(); + OS << " may happen in parallel with:\n"; + for (const Task *MPT : MPTasks.TaskList[S]) + OS << " task@" << MPT->getEntry()->getName() << "\n"; + } + } + } +} + +AnalysisKey TaskAnalysis::Key; + +TaskInfo TaskAnalysis::run(Function &F, FunctionAnalysisManager &AM) { + // FIXME: Currently we create a TaskInfo from scratch for every function. + // This may prove to be too wasteful due to deallocating and re-allocating + // memory each time for the underlying map and vector datastructures. At some + // point it may prove worthwhile to use a freelist and recycle TaskInfo + // objects. 
I don't want to add that kind of complexity until the scope of + // the problem is better understood. + TaskInfo TI; + TI.analyze(F, AM.getResult(F)); + return TI; +} + +PreservedAnalyses TaskPrinterPass::run(Function &F, + FunctionAnalysisManager &AM) { + AM.getResult(F).print(OS); + return PreservedAnalyses::all(); +} + +void llvm::printTask(Task &T, raw_ostream &OS, const std::string &Banner) { + + if (forcePrintModuleIR()) { + // handling -print-module-scope + OS << Banner << " (task: "; + T.getEntry()->printAsOperand(OS, false); + OS << ")\n"; + + // printing whole module + OS << *T.getEntry()->getModule(); + return; + } + + OS << Banner; + + for (auto *S : T.spindles()) { + if (T.getEntrySpindle() == S) + OS << "entry spindle: "; + else + OS << "spindle: "; + + for (auto *Block : S->blocks()) + if (Block) + Block->print(OS); + else + OS << "Printing block"; + } +} + +void Task::verify(const TaskInfo *TI, const BasicBlock *Entry, + const DominatorTree &DT) const { + // Scan the blocks and spindles in this task and check that TaskInfo stores + // the correct information for them. + SmallPtrSet DetachedBlocks; + for (Spindle *S : spindles()) { + assert(TI->getTaskFor(S) == this && + "TaskInfo associates spindle with different task"); + for (BasicBlock *B : S->blocks()) { + assert(encloses(B) && + "Task spindle contains a block not enclosed by task"); + assert(DT.dominates(Entry, B) && + "Task entry does not dominate all task blocks"); + assert(TI->getSpindleFor(B) == S && + "TaskInfo associates block with different spindle"); + + if (DetachInst *DI = dyn_cast(B->getTerminator())) { + assert(TI->isTaskEntry(DI->getDetached()) && + "Detached block is not a task entry"); + // Record all blocks found to be detached by this task. + DetachedBlocks.insert(DI->getDetached()); + } + } + } + + // Verify that the same number of detached blocks and subtasks are found. + assert(DetachedBlocks.size() == getSubTasks().size() && + "Mismatch found between detached blocks and subtasks"); + + for (Task *T : getSubTasks()) { + // Check the entry of this subtask and its predecessor. + BasicBlock *TEntry = T->getEntry(); + assert(DetachedBlocks.count(TEntry) && + "Subtask entry not among set of detached blocks"); +#ifndef NDEBUG + BasicBlock *TPred = TEntry->getSinglePredecessor(); + assert(TPred && "Task entry does not have a single predecessors"); + + // Check the successors of the detach instruction that created this task. + DetachInst *DI = dyn_cast(TPred->getTerminator()); + assert(DI && "Task predecessor is not terminated by a detach"); + assert(DI->getDetached() == TEntry && + "Task entry is not a detached successor"); + assert(!DT.dominates(TEntry, DI->getContinue()) && + "Task entry dominates continuation of task."); + assert((!DI->hasUnwindDest() || + !DT.dominates(TEntry, DI->getUnwindDest())) && + "Task entry dominates unwind destination of detach"); + + // Check that detach edge dominates all blocks in subtask. + SmallVector TaskBlocks; + T->getDominatedBlocks(TaskBlocks); + BasicBlockEdge DetachEdge(TPred, TEntry); + for (BasicBlock *B : TaskBlocks) + assert(DT.dominates(DetachEdge, B) && + "Detach edge does not dominate all blocks in task"); +#endif + // Recursively verify the subtask. 
+ T->verify(TI, TEntry, DT); + } +} + +void TaskInfo::verify(const DominatorTree &DT) const { + assert(RootTask && "No root task found"); + assert(RootTask->getEntry() == DT.getRoot() && + "Root task not rooted at dominator tree root"); + // Test the set of blocks extracted by getBlocks(), which uses the Task's + // associated dominator tree. + SmallVector TaskBlocks; + RootTask->getDominatedBlocks(TaskBlocks); +#ifndef NDEBUG + for (BasicBlock *B : TaskBlocks) { + Spindle *S = getSpindleFor(B); + assert(S && "TaskInfo does not associate this block with a spindle"); + assert(getTaskFor(S) && + "TaskInfo does not associate a task with this spindle"); + } +#endif + RootTask->verify(this, DT.getRoot(), DT); +} + +//===----------------------------------------------------------------------===// +// TaskInfo implementation +// + +TaskInfoWrapperPass::TaskInfoWrapperPass() : FunctionPass(ID) { + initializeTaskInfoWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +char TaskInfoWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(TaskInfoWrapperPass, "tasks", "Tapir Task Information", + true, true) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(TaskInfoWrapperPass, "tasks", "Tapir Task Information", + true, true) + +bool TaskInfoWrapperPass::runOnFunction(Function &F) { + releaseMemory(); + TI.analyze(F, getAnalysis().getDomTree()); + return false; +} + +void TaskInfoWrapperPass::verifyAnalysis() const { + // TaskInfoWrapperPass is a FunctionPass, but verifying every task in the + // function each time verifyAnalysis is called is very expensive. The + // -verify-task-info option can enable this. + if (VerifyTaskInfo) { + auto &DT = getAnalysis().getDomTree(); + TI.verify(DT); + } +} + +void TaskInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequiredTransitive(); +} + +void TaskInfoWrapperPass::print(raw_ostream &OS, const Module *) const { + TI.print(OS); +} + +PreservedAnalyses TaskVerifierPass::run(Function &F, + FunctionAnalysisManager &AM) { + TaskInfo &TI = AM.getResult(F); + auto &DT = AM.getResult(F); + TI.verify(DT); + return PreservedAnalyses::all(); +} + +//===----------------------------------------------------------------------===// +// Associated analysis routines + +/// Examine a given loop to determine if it is structurally a Tapir loop. +/// Returns the Task that encodes the loop body if so, or nullptr if not. +Task *llvm::getTaskIfTapirLoopStructure(const Loop *L, TaskInfo *TI) { + if (!L || !TI) + return nullptr; + + const BasicBlock *Header = L->getHeader(); + const BasicBlock *Latch = L->getLoopLatch(); + + LLVM_DEBUG(dbgs() << "Analyzing loop: " << *L); + + // Header must be terminated by a detach. + const DetachInst *DI = dyn_cast(Header->getTerminator()); + if (!DI) { + LLVM_DEBUG(dbgs() << "Loop header does not detach.\n"); + return nullptr; + } + + // Loop must have a unique latch. + if (!Latch) { + LLVM_DEBUG(dbgs() << "Loop does not have a unique latch.\n"); + return nullptr; + } + + // The loop latch must be the continuation of the detach in the header. + if (Latch != DI->getContinue()) { + LLVM_DEBUG(dbgs() << + "Continuation of detach in header is not the latch.\n"); + return nullptr; + } + + Task *T = TI->getTaskFor(DI->getDetached()); + assert(T && "Detached block not mapped to a task."); + assert(T->getDetach() == DI && "Task mapped to unexpected detach."); + + // All predecessors of the latch other than the header must be in the task. 
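+  // For illustration only, the CFG shape this routine accepts looks roughly
+  // like the following sketch (block and value names here are hypothetical,
+  // not taken from this patch):
+  //
+  //   header:  detach within %sr, label %body, label %latch
+  //   body:    ...                        ; spawned loop body (the task T)
+  //            reattach within %sr, label %latch
+  //   latch:   br i1 %done, label %exit, label %header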
+ for (const BasicBlock *Pred : predecessors(Latch)) { + if (Header == Pred) continue; + if (!T->encloses(Pred)) { + LLVM_DEBUG(dbgs() << "Latch has predecessor outside of spawned body.\n"); + return nullptr; + } + } + + // For each exit from the latch, any predecessor of that exit inside the loop + // must be the header or the latch. + for (const BasicBlock *Exit : successors(Latch)) { + for (const BasicBlock *ExitPred : predecessors(Exit)) { + if (!L->contains(ExitPred)) continue; + if (Header != ExitPred && Latch != ExitPred) { + LLVM_DEBUG(dbgs() << + "Loop branches to an exit of the latch from a block " << + "other than the header or latch.\n"); + return nullptr; + } + } + } + +#ifndef NDEBUG + // EXPENSIVE CHECK for verification. + // + // The blocks in this loop can only be the header, the latch, or a block + // contained in the task. + for (const BasicBlock *BB : L->blocks()) { + if (BB == Header) continue; + if (BB == Latch) continue; + assert(T->encloses(BB) && + "Loop contains block not enclosed by detached task.\n"); + } +#endif + + return T; +} diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 5b9a7b0f3322051..96e42f9056d7fb4 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -18,6 +18,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" using namespace llvm; static cl::opt ClVectorLibrary( @@ -42,6 +43,22 @@ static cl::opt ClVectorLibrary( clEnumValN(TargetLibraryInfoImpl::AMDLIBM, "AMDLIBM", "AMD vector math library"))); +static cl::opt ClTapirTarget( + "tapir-target", cl::Hidden, cl::desc("Target runtime for Tapir"), + cl::init(TapirTargetID::OpenCilk), + cl::values(clEnumValN(TapirTargetID::None, + "none", "None"), + clEnumValN(TapirTargetID::Serial, + "serial", "Serial code"), + clEnumValN(TapirTargetID::Cheetah, + "cheetah", "Cheetah"), + clEnumValN(TapirTargetID::OpenCilk, + "opencilk", "OpenCilk"), + clEnumValN(TapirTargetID::Lambda, + "lambda", "Lambda"), + clEnumValN(TapirTargetID::OMPTask, + "omptask", "OMPTask"))); + StringLiteral const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] = { #define TLI_DEFINE_STRING @@ -90,6 +107,17 @@ static const FuncProtoTy Signatures[] = { static_assert(sizeof Signatures / sizeof *Signatures == LibFunc::NumLibFuncs, "Missing library function signatures"); +TapirTargetOptions *TapirTargetOptions::clone() const { + TapirTargetOptions *New = nullptr; + switch (getKind()) { + default: + llvm_unreachable("Unhandled TapirTargetOption."); + case TTO_OpenCilk: + New = cast(this)->cloneImpl(); + } + return New; +} + static bool hasSinCosPiStret(const Triple &T) { // Only Darwin variants have _stret versions of combined trig functions. 
if (!T.isOSDarwin()) @@ -889,6 +917,9 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_memrchr); TLI.addVectorizableFunctionsFromVecLib(ClVectorLibrary, T); + + TLI.setTapirTarget(ClTapirTarget); + TLI.addTapirTargetLibraryFunctions(ClTapirTarget); } /// Initialize the set of available library functions based on the specified @@ -918,10 +949,13 @@ TargetLibraryInfoImpl::TargetLibraryInfoImpl(const TargetLibraryInfoImpl &TLI) ShouldExtI32Return(TLI.ShouldExtI32Return), ShouldSignExtI32Param(TLI.ShouldSignExtI32Param), ShouldSignExtI32Return(TLI.ShouldSignExtI32Return), - SizeOfInt(TLI.SizeOfInt) { + SizeOfInt(TLI.SizeOfInt), TapirTarget(TLI.TapirTarget) { + if (TLI.TTOptions) + TTOptions = std::unique_ptr(TLI.TTOptions->clone()); memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray)); VectorDescs = TLI.VectorDescs; ScalarDescs = TLI.ScalarDescs; + TapirTargetFuncs = TLI.TapirTargetFuncs; } TargetLibraryInfoImpl::TargetLibraryInfoImpl(TargetLibraryInfoImpl &&TLI) @@ -930,11 +964,13 @@ TargetLibraryInfoImpl::TargetLibraryInfoImpl(TargetLibraryInfoImpl &&TLI) ShouldExtI32Return(TLI.ShouldExtI32Return), ShouldSignExtI32Param(TLI.ShouldSignExtI32Param), ShouldSignExtI32Return(TLI.ShouldSignExtI32Return), - SizeOfInt(TLI.SizeOfInt) { + SizeOfInt(TLI.SizeOfInt), TapirTarget(TLI.TapirTarget), + TTOptions(std::move(TLI.TTOptions)) { std::move(std::begin(TLI.AvailableArray), std::end(TLI.AvailableArray), AvailableArray); VectorDescs = TLI.VectorDescs; ScalarDescs = TLI.ScalarDescs; + TapirTargetFuncs = TLI.TapirTargetFuncs; } TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(const TargetLibraryInfoImpl &TLI) { @@ -944,6 +980,9 @@ TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(const TargetLibraryInfoI ShouldSignExtI32Param = TLI.ShouldSignExtI32Param; ShouldSignExtI32Return = TLI.ShouldSignExtI32Return; SizeOfInt = TLI.SizeOfInt; + TapirTarget = TLI.TapirTarget; + if (TLI.TTOptions) + TTOptions = std::unique_ptr(TLI.TTOptions->clone()); memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray)); return *this; } @@ -955,6 +994,8 @@ TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(TargetLibraryInfoImpl && ShouldSignExtI32Param = TLI.ShouldSignExtI32Param; ShouldSignExtI32Return = TLI.ShouldSignExtI32Return; SizeOfInt = TLI.SizeOfInt; + TapirTarget = TLI.TapirTarget; + TTOptions = std::move(TLI.TTOptions); std::move(std::begin(TLI.AvailableArray), std::end(TLI.AvailableArray), AvailableArray); return *this; @@ -1328,6 +1369,58 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( } } +void TargetLibraryInfoImpl::addTapirTargetLibraryFunctions( + TapirTargetID TargetID) { + switch (TargetID) { + case TapirTargetID::OpenCilk: { + const StringLiteral TTFuncs[] = { + #define TLI_DEFINE_CILK_LIBS + #include "llvm/Analysis/TapirTargetFuncs.def" + }; + TapirTargetFuncs.insert(TapirTargetFuncs.end(), std::begin(TTFuncs), + std::end(TTFuncs)); + break; + } + case TapirTargetID::None: + case TapirTargetID::Serial: + case TapirTargetID::Cheetah: + case TapirTargetID::Lambda: + case TapirTargetID::OMPTask: + case TapirTargetID::Qthreads: + case TapirTargetID::Last_TapirTargetID: + break; + } + + // Ensure that the collected Tapir-target functions are in sorted order. 
+ llvm::sort(TapirTargetFuncs); +} + +bool TargetLibraryInfoImpl::isTapirTargetLibFunc(StringRef funcName) const { + funcName = sanitizeFunctionName(funcName); + if (funcName.empty()) + return false; + + const auto Start = TapirTargetFuncs.begin(); + const auto End = TapirTargetFuncs.end(); + const auto I = std::lower_bound(Start, End, funcName); + if (I != End && *I == funcName) + return true; + return false; +} + +bool TargetLibraryInfoImpl::isTapirTargetLibFunc( + const Function &FDecl) const { + // Intrinsics don't overlap w/libcalls; if our module has a large number of + // intrinsics, this ends up being an interesting compile time win since we + // avoid string normalization and comparison. + if (FDecl.isIntrinsic()) return false; + + // TODO: Check the function prototype of the Tapir-target library function to + // ensure a match. This change may require building more detailed knowledge + // of these functions into TargetLibraryInfo. + return isTapirTargetLibFunc(FDecl.getName()); +} + bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const { funcName = sanitizeFunctionName(funcName); if (funcName.empty()) diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 6a0fa98089ba53f..e2c63bb0c5b6ae3 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -396,6 +396,11 @@ void TargetTransformInfo::getPeelingPreferences(Loop *L, ScalarEvolution &SE, return TTIImpl->getPeelingPreferences(L, SE, PP); } +void TargetTransformInfo::getStripMiningPreferences( + Loop *L, ScalarEvolution &SE, StripMiningPreferences &SMP) const { + return TTIImpl->getStripMiningPreferences(L, SE, SMP); +} + bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const { return TTIImpl->isLegalAddImmediate(Imm); } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 4b77c0046cc70f5..5b8f0c12aeb3f09 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -484,6 +484,14 @@ static bool isEphemeralValueOf(const Instruction *I, const Value *E) { // Is this an intrinsic that cannot be speculated but also cannot trap? bool llvm::isAssumeLikeIntrinsic(const Instruction *I) { + // Check for invokes of detached.rethrow, taskframe.resume, or sync.unwind. + if (const InvokeInst *II = dyn_cast(I)) + if (const Function *Called = II->getCalledFunction()) + if (Intrinsic::detached_rethrow == Called->getIntrinsicID() || + Intrinsic::taskframe_resume == Called->getIntrinsicID() || + Intrinsic::sync_unwind == Called->getIntrinsicID()) + return true; + if (const IntrinsicInst *CI = dyn_cast(I)) return CI->isAssumeLikeIntrinsic(); @@ -6918,6 +6926,9 @@ bool llvm::isSafeToSpeculativelyExecuteWithOpcode( case Instruction::CatchRet: case Instruction::CleanupPad: case Instruction::CleanupRet: + case Instruction::Detach: + case Instruction::Reattach: + case Instruction::Sync: return false; // Misc instructions which have effects } } diff --git a/llvm/lib/Analysis/WorkSpanAnalysis.cpp b/llvm/lib/Analysis/WorkSpanAnalysis.cpp new file mode 100644 index 000000000000000..0f0b66147c3be8b --- /dev/null +++ b/llvm/lib/Analysis/WorkSpanAnalysis.cpp @@ -0,0 +1,118 @@ +//===- WorkSpanAnalysis.cpp - Analysis to estimate work and span ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements an analysis pass to estimate the work and span of the +// program. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/WorkSpanAnalysis.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "work-span" + +// Get a constant trip count for the given loop. +unsigned llvm::getConstTripCount(const Loop *L, ScalarEvolution &SE) { + int64_t ConstTripCount = 0; + // If there are multiple exiting blocks but one of them is the latch, use + // the latch for the trip count estimation. Otherwise insist on a single + // exiting block for the trip count estimation. + BasicBlock *ExitingBlock = L->getLoopLatch(); + if (!ExitingBlock || !L->isLoopExiting(ExitingBlock)) + ExitingBlock = L->getExitingBlock(); + if (ExitingBlock) + ConstTripCount = SE.getSmallConstantTripCount(L, ExitingBlock); + return ConstTripCount; +} + +/// Recursive helper routine to estimate the amount of work in a loop. +static void estimateLoopCostHelper(const Loop *L, CodeMetrics &Metrics, + WSCost &LoopCost, LoopInfo *LI, + ScalarEvolution *SE) { + if (LoopCost.UnknownCost) + return; + + // TODO: Handle control flow within the loop intelligently, using + // BlockFrequencyInfo. + for (Loop *SubL : *L) { + WSCost SubLoopCost; + estimateLoopCostHelper(SubL, Metrics, SubLoopCost, LI, SE); + // Quit early if the size of this subloop is already too big. + if (InstructionCost::getMax() == SubLoopCost.Work) + LoopCost.Work = InstructionCost::getMax(); + + // Find a constant trip count if available + int64_t ConstTripCount = SE ? getConstTripCount(SubL, *SE) : 0; + // TODO: Use a more precise analysis to account for non-constant trip + // counts. + if (!ConstTripCount) { + LoopCost.UnknownCost = true; + // If we cannot compute a constant trip count, assume this subloop + // executes at least once. + ConstTripCount = 1; + } + + // Check if the total size of this subloop is huge. + if (InstructionCost::getMax() / ConstTripCount > SubLoopCost.Work) + LoopCost.Work = InstructionCost::getMax(); + + // Check if this subloop suffices to make loop L huge. + if (InstructionCost::getMax() - LoopCost.Work < + (SubLoopCost.Work * ConstTripCount)) + LoopCost.Work = InstructionCost::getMax(); + + // Add in the size of this subloop. + LoopCost.Work += (SubLoopCost.Work * ConstTripCount); + } + + // After looking at all subloops, if we've concluded we have a huge loop size, + // return early. + if (InstructionCost::getMax() == LoopCost.Work) + return; + + for (BasicBlock *BB : L->blocks()) + if (LI->getLoopFor(BB) == L) { + // Check if this BB suffices to make loop L huge. 
+ if (InstructionCost::getMax() - LoopCost.Work < Metrics.NumBBInsts[BB]) { + LoopCost.Work = InstructionCost::getMax(); + return; + } + LoopCost.Work += Metrics.NumBBInsts[BB]; + } +} + +void llvm::estimateLoopCost(WSCost &LoopCost, const Loop *L, LoopInfo *LI, + ScalarEvolution *SE, const TargetTransformInfo &TTI, + TargetLibraryInfo *TLI, + const SmallPtrSetImpl &EphValues) { + // TODO: Use more precise analysis to estimate the work in each call. + // TODO: Use vectorizability to enhance cost analysis. + + // Gather code metrics for all basic blocks in the loop. + for (BasicBlock *BB : L->blocks()) + LoopCost.Metrics.analyzeBasicBlock(BB, TTI, EphValues, + /*PrepareForLTO*/ false, TLI); + + estimateLoopCostHelper(L, LoopCost.Metrics, LoopCost, LI, SE); +} diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 7d7fe19568e8a65..6ac11fd4077be78 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -890,6 +890,9 @@ lltok::Kind LLLexer::LexIdentifier() { INSTKEYWORD(resume, Resume); INSTKEYWORD(unreachable, Unreachable); INSTKEYWORD(callbr, CallBr); + INSTKEYWORD(detach, Detach); + INSTKEYWORD(reattach, Reattach); + INSTKEYWORD(sync, Sync); INSTKEYWORD(alloca, Alloca); INSTKEYWORD(load, Load); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index a886f6e3a4b93df..7e3e915733c425d 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -6878,6 +6878,12 @@ int LLParser::parseInstruction(Instruction *&Inst, BasicBlock *BB, return parseCleanupPad(Inst, PFS); case lltok::kw_callbr: return parseCallBr(Inst, PFS); + case lltok::kw_detach: + return parseDetach(Inst, PFS); + case lltok::kw_reattach: + return parseReattach(Inst, PFS); + case lltok::kw_sync: + return parseSync(Inst, PFS); // Unary Operators. 
  case lltok::kw_fneg: {
    FastMathFlags FMF = EatFastMathFlagsIfPresent();
@@ -7170,6 +7176,98 @@ bool LLParser::parseBr(Instruction *&Inst, PerFunctionState &PFS) {
   return false;
 }
 
+/// parseDetach
+///   ::= 'detach' within SyncRegion ',' TypeAndValue ',' TypeAndValue
+///   ::= 'detach' within SyncRegion ',' TypeAndValue ',' TypeAndValue \
+///         unwind TypeAndValue
+bool LLParser::parseDetach(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc, Loc2;
+  Value *SR;
+  BasicBlock *Op1, *Op2;
+
+  if (parseToken(lltok::kw_within, "expected 'within' after detach"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return tokError("expected scope value for detach");
+
+  if (parseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (parseToken(lltok::comma, "expected ',' after detach scope"))
+    return true;
+
+  if (parseTypeAndBasicBlock(Op1, Loc, PFS) ||
+      parseToken(lltok::comma, "expected ',' after detached destination") ||
+      parseTypeAndBasicBlock(Op2, Loc2, PFS))
+    return true;
+
+  LocTy Loc3;
+  BasicBlock *UnwindBB = nullptr;
+  if (EatIfPresent(lltok::kw_unwind)) {
+    if (parseTypeAndBasicBlock(UnwindBB, Loc3, PFS))
+      return true;
+    Inst = DetachInst::Create(Op1, Op2, UnwindBB, SR);
+  } else
+    Inst = DetachInst::Create(Op1, Op2, SR);
+  return false;
+}
+
+/// parseReattach
+///   ::= 'reattach' within SyncRegion ',' TypeAndValue
+bool LLParser::parseReattach(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *SR;
+  BasicBlock *Op;
+
+  if (parseToken(lltok::kw_within, "expected 'within' after reattach"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return tokError("expected scope value for reattach");
+
+  if (parseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (parseToken(lltok::comma, "expected ',' after reattach scope"))
+    return true;
+
+  if (parseTypeAndBasicBlock(Op, Loc, PFS))
+    return true;
+
+  Inst = ReattachInst::Create(Op, SR);
+  return false;
+}
+
+/// parseSync
+///   ::= 'sync' within SyncRegion ',' TypeAndValue
+bool LLParser::parseSync(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *SR;
+  BasicBlock *Op;
+
+  if (parseToken(lltok::kw_within, "expected 'within' after sync"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return tokError("expected scope value for sync");
+
+  if (parseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (parseToken(lltok::comma, "expected ',' after scope in sync"))
+    return true;
+
+  if (parseTypeAndBasicBlock(Op, Loc, PFS))
+    return true;
+
+  Inst = SyncInst::Create(Op, SR);
+  return false;
+}
+
 /// parseSwitch
 ///  Instruction
 ///    ::= 'switch' TypeAndValue ',' TypeAndValue '[' JumpTable ']'
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 84d624f6cf8fa25..92411913c43eee1 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2127,12 +2127,16 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
     return Attribute::SafeStack;
   case bitc::ATTR_KIND_SHADOWCALLSTACK:
     return Attribute::ShadowCallStack;
+  case bitc::ATTR_KIND_STEALABLE:
+    return Attribute::Stealable;
   case bitc::ATTR_KIND_STRICT_FP:
    return Attribute::StrictFP;
   case bitc::ATTR_KIND_STRUCT_RET:
    return Attribute::StructRet;
  case
bitc::ATTR_KIND_SANITIZE_ADDRESS: return Attribute::SanitizeAddress; + case bitc::ATTR_KIND_SANITIZE_CILK: + return Attribute::SanitizeCilk; case bitc::ATTR_KIND_SANITIZE_HWADDRESS: return Attribute::SanitizeHWAddress; case bitc::ATTR_KIND_SANITIZE_THREAD: @@ -5969,6 +5973,73 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = new UnreachableInst(Context); InstructionList.push_back(I); break; + case bitc::FUNC_CODE_INST_DETACH: { // DETACH: [bb#, bb#, [bb#,] val] + if (Record.size() != 3 && Record.size() != 4) + return error("Invalid record"); + BasicBlock *Detached = getBasicBlock(Record[0]); + if (!Detached) + return error("Invalid record"); + + BasicBlock *Continue = getBasicBlock(Record[1]); + if (!Continue) + return error("Invalid record"); + + unsigned SREntry = 2; + BasicBlock *Unwind = nullptr; + if (Record.size() == 4) { + Unwind = getBasicBlock(Record[SREntry++]); + if (!Unwind) + return error("Invalid record"); + } + + Type *TokenTy = Type::getTokenTy(Context); + Value *SyncRegion = getValue(Record, SREntry, NextValueNo, TokenTy, + getVirtualTypeID(TokenTy), CurBB); + if (!SyncRegion) + return error("Invalid record"); + + if (Unwind) + I = DetachInst::Create(Detached, Continue, Unwind, SyncRegion); + else + I = DetachInst::Create(Detached, Continue, SyncRegion); + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_REATTACH: { // REATTACH: [bb#, val] + if (Record.size() != 2) + return error("Invalid record"); + + BasicBlock *DetachContinue = getBasicBlock(Record[0]); + if (!DetachContinue) + return error("Invalid record"); + + Type *TokenTy = Type::getTokenTy(Context); + Value *SyncRegion = getValue(Record, 1, NextValueNo, TokenTy, + getVirtualTypeID(TokenTy), CurBB); + if (!SyncRegion) + return error("Invalid record"); + + I = ReattachInst::Create(DetachContinue, SyncRegion); + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_SYNC: { // Sync: [bb#, val] + if (Record.size() != 2) + return error("Invalid record"); + BasicBlock *Continue = getBasicBlock(Record[0]); + if (!Continue) + return error("Invalid record"); + + Type *TokenTy = Type::getTokenTy(Context); + Value *SyncRegion = getValue(Record, 1, NextValueNo, TokenTy, + getVirtualTypeID(TokenTy), CurBB); + if (!SyncRegion) + return error("Invalid record"); + + I = SyncInst::Create(Continue, SyncRegion); + InstructionList.push_back(I); + break; + } case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...] 
if (Record.empty()) return error("Invalid phi record"); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 324dcbca8137ed5..85f9d1556c26c7b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -829,12 +829,16 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_SAFESTACK; case Attribute::ShadowCallStack: return bitc::ATTR_KIND_SHADOWCALLSTACK; + case Attribute::Stealable: + return bitc::ATTR_KIND_STEALABLE; case Attribute::StrictFP: return bitc::ATTR_KIND_STRICT_FP; case Attribute::StructRet: return bitc::ATTR_KIND_STRUCT_RET; case Attribute::SanitizeAddress: return bitc::ATTR_KIND_SANITIZE_ADDRESS; + case Attribute::SanitizeCilk: + return bitc::ATTR_KIND_SANITIZE_CILK; case Attribute::SanitizeHWAddress: return bitc::ATTR_KIND_SANITIZE_HWADDRESS; case Attribute::SanitizeThread: @@ -3226,6 +3230,33 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Code = bitc::FUNC_CODE_INST_UNREACHABLE; AbbrevToUse = FUNCTION_INST_UNREACHABLE_ABBREV; break; + case Instruction::Detach: + { + Code = bitc::FUNC_CODE_INST_DETACH; + const DetachInst &DI = cast(I); + Vals.push_back(VE.getValueID(DI.getDetached())); + Vals.push_back(VE.getValueID(DI.getContinue())); + if (DI.hasUnwindDest()) + Vals.push_back(VE.getValueID(DI.getUnwindDest())); + pushValue(DI.getSyncRegion(), InstID, Vals); + } + break; + case Instruction::Reattach: + { + Code = bitc::FUNC_CODE_INST_REATTACH; + const ReattachInst &RI = cast(I); + Vals.push_back(VE.getValueID(RI.getSuccessor(0))); + pushValue(RI.getSyncRegion(), InstID, Vals); + } + break; + case Instruction::Sync: + { + Code = bitc::FUNC_CODE_INST_SYNC; + const SyncInst &SI = cast(I); + Vals.push_back(VE.getValueID(SI.getSuccessor(0))); + pushValue(SI.getSyncRegion(), InstID, Vals); + } + break; case Instruction::PHI: { const PHINode &PN = cast(I); diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index f1607f85c5b3198..9d81cc20066c0c5 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -228,6 +228,7 @@ add_llvm_component_library(LLVMCodeGen SwitchLoweringUtils.cpp TailDuplication.cpp TailDuplicator.cpp + TapirCleanup.cpp TargetFrameLoweringImpl.cpp TargetInstrInfo.cpp TargetLoweringBase.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 31fa4c105cef80b..9330b7b37e990e8 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -131,6 +131,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeStackSlotColoringPass(Registry); initializeStripDebugMachineModulePass(Registry); initializeTailDuplicatePass(Registry); + initializeTapirCleanupPass(Registry); initializeTargetPassConfigPass(Registry); initializeTwoAddressInstructionLegacyPassPass(Registry); initializeTypePromotionLegacyPass(Registry); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index eb010afd41b6b71..c7655d47435e835 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1351,6 +1351,62 @@ bool IRTranslator::translateIndirectBr(const User &U, return true; } +bool IRTranslator::translateDetach(const User &U, + MachineIRBuilder &MIRBuilder) { + const DetachInst &DetInst = cast(U); + + // Lowering of Tapir instructions should have happened already. 
At this + // stage, treat Detach like an unconditional branch to the detached successor. + const BasicBlock &DetTgt = *cast(DetInst.getDetached()); + MachineBasicBlock &TgtBB = getMBB(DetTgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the detached successor is the layout successor, fallthrough. + if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); + + // Link detached successor. + CurBB.addSuccessor(&getMBB(*cast(DetInst.getDetached()))); + return true; +} + +bool IRTranslator::translateReattach(const User &U, + MachineIRBuilder &MIRBuilder) { + const ReattachInst &ReatInst = cast(U); + + // Lowering of Tapir instructions should have happened already. At this + // stage, treat Reattach like an unconditional branch to its successor. + const BasicBlock &ReatTgt = *cast(ReatInst.getSuccessor(0)); + MachineBasicBlock &TgtBB = getMBB(ReatTgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the reattach successor is the layout successor, fallthrough. + if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); + + // Link the Reattach instruction's successor. + CurBB.addSuccessor(&getMBB(*cast(ReatInst.getSuccessor(0)))); + return true; +} + +bool IRTranslator::translateSync(const User &U, MachineIRBuilder &MIRBuilder) { + const SyncInst &SInst = cast(U); + + // Lowering of Tapir instructions should have happened already. At this + // stage, treat Sync like an unconditional branch to its successor. + const BasicBlock &STgt = *cast(SInst.getSuccessor(0)); + MachineBasicBlock &TgtBB = getMBB(STgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the sync successor is the layout successor, fallthrough. + if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); + + // Link the Sync instruction's successor. + CurBB.addSuccessor(&getMBB(*cast(SInst.getSuccessor(0)))); + return true; +} + static bool isSwiftError(const Value *V) { if (auto Arg = dyn_cast(V)) return Arg->hasSwiftErrorAttr(); @@ -2600,6 +2656,24 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateVectorDeinterleave2Intrinsic(CI, MIRBuilder); } + case Intrinsic::syncregion_start: + // Lower the starting point of a Tapir sync region to a no-op. + case Intrinsic::taskframe_load_guard: + // Discard any taskframe.load.guards. + case Intrinsic::taskframe_create: + // Discard any taskframe.creates. + case Intrinsic::taskframe_use: + // Discard any taskframe.uses. + case Intrinsic::taskframe_end: + // Discard any taskframe.ends. + case Intrinsic::sync_unwind: + // Discard any sync.unwinds. + case Intrinsic::tapir_runtime_start: + // Discard any tapir.runtime.starts. + case Intrinsic::tapir_runtime_end: + // Discard any tapir.runtime.ends. + return true; + #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ case Intrinsic::INTRINSIC: #include "llvm/IR/ConstrainedOps.def" diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 0a6ce6a1358170d..0c039e795c7d69b 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -4062,6 +4062,18 @@ template <> class SSAUpdaterTraits { return Num; } + /// BlockReattaches - Always returns false, because machine basic blocks + /// should never contain Tapir instructions. 
+ static bool BlockReattaches(LDVSSABlock *BB, LDVSSAUpdater *Updater) { + return false; + } + + /// BlockDetaches - Always returns false, because machine basic blocks + /// should never contain Tapir instructions. + static bool BlockDetaches(LDVSSABlock *BB, LDVSSAUpdater *Updater) { + return false; + } + /// CreateEmptyPHI - Create a (representation of a) PHI in the given block. /// SSAUpdater will populate it with information about incoming values. The /// value number of this PHI is whatever the machine value number problem diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index a5d6a40392d0cb7..e3e4d43b1ae0e13 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -489,6 +489,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, MF.setAlignment(YamlMF.Alignment.valueOrOne()); MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); + MF.setExposesOpaqueReturnsTwice(YamlMF.ExposesOpaqueReturnsTwice); MF.setHasWinCFI(YamlMF.HasWinCFI); MF.setCallsEHReturn(YamlMF.CallsEHReturn); diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 48c3e0d7a97e61b..5f17cee8dd602f0 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -198,6 +198,7 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.Name = MF.getName(); YamlMF.Alignment = MF.getAlignment(); YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); + YamlMF.ExposesOpaqueReturnsTwice = MF.exposesOpaqueReturnsTwice(); YamlMF.HasWinCFI = MF.hasWinCFI(); YamlMF.CallsEHReturn = MF.callsEHReturn(); diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 4c864ca15ccc52c..23bfdcce2dcb3c4 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -786,11 +786,12 @@ bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { // Blocks with single successors don't create additional fallthrough // opportunities. Don't duplicate them. TODO: When conditional exits are // analyzable, allow them to be duplicated. - bool IsSimple = TailDup.isSimpleBB(BB); - if (BB->succ_size() == 1) return false; - return TailDup.shouldTailDuplicate(IsSimple, *BB); + + BlockDesc Desc = TailDup.getBlockDesc(BB); + + return TailDup.shouldTailDuplicate(Desc, *BB); } /// Compare 2 BlockFrequency's with a small penalty for \p A. @@ -3187,7 +3188,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock( function_ref(RemovalCallback); SmallVector DuplicatedPreds; - bool IsSimple = TailDup.isSimpleBB(BB); + BlockDesc Desc = TailDup.getBlockDesc(BB); SmallVector CandidatePreds; SmallVectorImpl *CandidatePtr = nullptr; if (F->getFunction().hasProfileData()) { @@ -3198,7 +3199,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock( if (CandidatePreds.size() < BB->pred_size()) CandidatePtr = &CandidatePreds; } - TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds, + TailDup.tailDuplicateAndUpdate(Desc, BB, LPred, &DuplicatedPreds, &RemovalCallbackRef, CandidatePtr); // Update UnscheduledPredecessors to reflect tail-duplication. 
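
The IRTranslator changes above treat any detach, reattach, or sync that survives to instruction selection as a plain unconditional branch. For context, the sketch below shows the kind of OpenCilk source that the front end lowers into those Tapir constructs before Tapir lowering; it is illustrative only, and the function name, build flags, and comments are not part of this patch.

```cpp
// Hypothetical example, not part of this patch. With an OpenCilk compiler this
// would typically be built as: clang++ -fopencilk -O2 fib.cpp
#include <cilk/cilk.h>

long fib(long n) {
  if (n < 2)
    return n;
  long x = cilk_spawn fib(n - 1); // front end emits a Tapir detach here; the
                                  // spawned task ends with a reattach
  long y = fib(n - 2);            // continuation, may run in parallel with the spawn
  cilk_sync;                      // front end emits a Tapir sync here
  return x + y;
}
```
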
diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp index 4cbb6ad3128bd9f..fee2fac2184ce62 100644 --- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp +++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp @@ -317,6 +317,19 @@ class SSAUpdaterTraits { return NewDef->getOperand(0).getReg(); } + /// BlockReattaches - Always returns false, because machine basic blocks + /// should never contain Tapir instructions. + static bool BlockReattaches(MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return false; + } + + /// BlockDetaches - Always returns false, because machine basic blocks + /// should never contain Tapir instructions. + static bool BlockDetaches(MachineBasicBlock *BB, MachineSSAUpdater *Updater) { + return false; + } + /// CreateEmptyPHI - Create a PHI instruction that defines a new register. /// Add it into the specified block and return the register. static Register CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds, @@ -362,6 +375,12 @@ class SSAUpdaterTraits { static Register GetPHIValue(MachineInstr *PHI) { return PHI->getOperand(0).getReg(); } + + static void MarkDetachedDef(unsigned Val, MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return; + } + }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 4b3ff57fb478aee..3977022d8568f15 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -219,6 +219,8 @@ namespace { bool hasStoreBetween(MachineBasicBlock *From, MachineBasicBlock *To, MachineInstr &MI); + bool possiblyHasSetjmpBetween(MachineBasicBlock *From, + MachineBasicBlock *To, MachineInstr &MI); /// Postpone the splitting of the given critical /// edge (\p From, \p To). @@ -330,6 +332,91 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB, return false; } +// Helper function to check if MBB contains a terminator that might correspond +// with EH_SjLj_Setup. +static bool blockMayContainSetjmpSetup(const MachineBasicBlock *MBB, + const MachineBasicBlock *Succ) { + for (const MachineInstr &MI : MBB->terminators()) + // It seems hard to check for EH_SjLj_Setup directly, since that instruction + // seems to be target-dependent. Instead we simply check if the terminator + // has unmodeled side effects. + if (MI.hasUnmodeledSideEffects() && + llvm::any_of(MI.operands(), [&](const MachineOperand &Op) { + return Op.isMBB() && Op.getMBB() == Succ; + })) + return true; + return false; +} + +// possiblyHasSetjmpBetween - Check for setjmps along the path from block From +// to block To. +bool MachineSinking::possiblyHasSetjmpBetween(MachineBasicBlock *From, + MachineBasicBlock *To, + MachineInstr &MI) { + // Copies and other transient instructions are safe to move past setjmps. + if (MI.isCopyLike()) + return false; + + // If MI cannot store and it does not read any register operands (which might + // be spilled), then they are safe to move past setjmps. + if (!MI.mayStore() && + !llvm::any_of(MI.operands(), [&](const MachineOperand &Op) { + if (Op.isReg() && Op.getReg().isValid() && !Op.isDef()) { + LLVM_DEBUG(dbgs() + << "Reads valid register operand " << Op << "\n"); + return true; + } + return false; + })) + return false; + + // For now we examine just the predecessors of predecessors of To for possible + // setjmp-setup constructs. For example: + // + // Pred: + // ... 
+  //      EH_SjLj_Setup BB
+  //    BB:
+  //      = MOV 1
+  //      JMP To
+  //    To:
+  //      = PHI
+  //      TEST
+  //      CONDITIONAL_JMP
+  //
+  // Note that it is safe to move an instruction after the conditional jmp, but
+  // not into the body of To.  At this time LLVM does not seem to generate more
+  // complex control-flow structures encoding setjmps.  This code should be
+  // revisited if LLVM is able to generate more complex control-flow structures
+  // for setjmp.
+  for (MachineBasicBlock *BB : To->predecessors()) {
+    if (BB->hasAddressTaken() && PDT->dominates(To, BB)) {
+      // Since BB's address is taken, BB might be the destination of a longjmp.
+      LLVM_DEBUG(dbgs() << "Checking predecessor " << *BB);
+      for (MachineBasicBlock *Pred : BB->predecessors()) {
+        if (PDT->dominates(To, Pred)) {
+          LLVM_DEBUG(dbgs() << "Checking predecessor of predecessor " << *Pred);
+          if (blockMayContainSetjmpSetup(Pred, BB)) {
+            // Pred might contain a setjmp with BB the destination of a
+            // corresponding longjmp.  If BB contains an instruction that
+            // produces a definition, assume that definition is used to
+            // distinguish different returns from the setjmp, meaning it's
+            // unsafe to sink the instruction past that definition.
+            for (MachineInstr &I : *BB) {
+              if (I.mayStore() || I.getNumDefs() > 0) {
+                LLVM_DEBUG(dbgs() << "Found definition in pred-pred block: "
+                                  << I << "\n");
+                return true;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
 bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI,
                                                      MachineBasicBlock *MBB) {
   if (!MI.isCopy())
@@ -1725,6 +1812,13 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
       TryBreak = true;
     }
 
+    // Don't sink instructions into successors of setjmps that may execute
+    // multiple times.
+    if (!TryBreak && possiblyHasSetjmpBetween(ParentBlock, SuccToSinkTo, MI)) {
+      LLVM_DEBUG(dbgs() << " *** NOTE: Possible setjmp setup found\n");
+      TryBreak = true;
+    }
+
     // Otherwise we are OK with sinking along a critical edge.
     if (!TryBreak)
       LLVM_DEBUG(dbgs() << "Sinking along critical edge.\n");
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 043ea2019148746..d368e79930be722 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -231,6 +231,11 @@ namespace {
     void setUndefOnPrunedSubRegUses(LiveInterval &LI, Register Reg,
                                     LaneBitmask PrunedLanes);
 
+    /// Return true if the live interval from coalescing SrcLI and DstLI crosses
+    /// a basic-block edge that may be produced by a setjmp.
+    bool coalescedLiveIntervalMayCrossSetjmp(LiveInterval &SrcLI,
+                                             LiveInterval &DstLI);
+
     /// Attempt to join intervals corresponding to SrcReg/DstReg, which are the
     /// src/dst of the copy instruction CopyMI.  This returns true if the copy
     /// was successfully coalesced away.  If it is not currently possible to
@@ -1965,6 +1970,45 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI,
   LIS->shrinkToUses(&LI);
 }
 
+// Helper function to check if MBB contains a terminator that might correspond
+// with EH_SjLj_Setup.
+static bool blockMayContainSetjmpSetup(const MachineBasicBlock *MBB,
+                                       const MachineBasicBlock *Succ) {
+  for (const MachineInstr &MI : MBB->terminators())
+    // It seems hard to check for EH_SjLj_Setup directly, since that instruction
+    // seems to be target-dependent.  Instead we simply check if the terminator
+    // has unmodeled side effects.
+ if (MI.hasUnmodeledSideEffects() && + llvm::any_of(MI.operands(), [&](const MachineOperand &Op) { + return Op.isMBB() && Op.getMBB() == Succ; + })) + return true; + return false; +} + +bool RegisterCoalescer::coalescedLiveIntervalMayCrossSetjmp( + LiveInterval &SrcLI, LiveInterval &DstLI) { + for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) { + MachineBasicBlock *MBB = &*I; + // If MBB's address is taken, then it might be the destination of a longjmp. + // Check if Src or Dst are live into the block. + if (MBB->hasAddressTaken() && + (LIS->isLiveInToMBB(SrcLI, MBB) || LIS->isLiveInToMBB(DstLI, MBB))) { + // Check the predecessors of MBB for a terminator that might be a + // EH_SjLj_Setup, and check if Src and Dest are live out of that + // predecessor. + for (MachineBasicBlock *Pred : MBB->predecessors()) + if (blockMayContainSetjmpSetup(Pred, MBB) && + (LIS->isLiveOutOfMBB(SrcLI, Pred) || + LIS->isLiveOutOfMBB(DstLI, Pred))) + // Guess that the coalesced liveness range would cross this edge from + // the setjmp. + return true; + } + } + return false; +} + bool RegisterCoalescer::joinCopy( MachineInstr *CopyMI, bool &Again, SmallPtrSetImpl &CurrentErasedInstrs) { @@ -1986,6 +2030,13 @@ bool RegisterCoalescer::joinCopy( std::swap(SrcIdx, DstIdx); std::swap(SrcRC, DstRC); } + if (MF->exposesReturnsTwice() && + coalescedLiveIntervalMayCrossSetjmp(LIS->getInterval(CP.getSrcReg()), + LIS->getInterval(CP.getDstReg()))) { + LLVM_DEBUG( + dbgs() << "\tNot coalescing: liveness ranges may cross setjmp.\n"); + return false; + } if (!TRI->shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx, CP.getNewRC(), *LIS)) { LLVM_DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n"); @@ -4202,7 +4253,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { // // TODO: Could specifically disable coalescing registers live across setjmp // calls - if (fn.exposesReturnsTwice()) { + if (fn.exposesOpaqueReturnsTwice()) { LLVM_DEBUG( dbgs() << "* Skipped as it exposes functions that returns twice.\n"); return false; diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 8f5b05b662b33a9..130eae58585762c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -127,6 +127,16 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, } } + // If the function might be stolen, then several optimizations involving SP + // and FP aren't generally allowed. For example, the Cilk runtime system + // might change the stack a function uses after it performs a spawn, meaning + // that SP can't be used to index stack variables or temporary storage. The + // semantics for the stack memory of such a function most closely resemble + // those of a function with dynamic allocas, so we simply set this flag in + // MachineFrameInfo. + if (Fn->hasFnAttribute(Attribute::Stealable)) + MF->getFrameInfo().setHasVarSizedObjects(); + // Initialize the mapping of values to registers. This is only set up for // instruction values that are used outside of the block that defines // them. 
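
The MachineSink and RegisterCoalescer guards above are conservative around setjmp-like control flow because an edge produced by a setjmp can be traversed twice: once on the direct path and once again via longjmp. The sketch below is a hypothetical source-level illustration of that situation (nothing in it comes from this patch); it shows why a value must not be silently sunk or coalesced across such an edge.

```cpp
// Hypothetical illustration, not part of this patch: the edge out of setjmp()
// is reached twice, once directly and once again via longjmp().
#include <csetjmp>

static std::jmp_buf Env;

static void fail() { std::longjmp(Env, 1); } // re-enters status() at setjmp()

int status() {
  volatile int x = 0;      // volatile: must survive the second "return" of setjmp
  if (setjmp(Env) == 0) {  // returns 0 first, then 1 after the longjmp
    x = 1;
    fail();
  }
  // Both returns of setjmp reach this point. If the compiler had sunk or
  // coalesced a value across the setjmp edge, the second arrival here could
  // observe stale register or stack contents, which is exactly what the
  // checks above conservatively rule out.
  return x;
}
```
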
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index f44added89a7283..f5f4451f75c187b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -204,6 +204,8 @@ static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) {
 void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
   SDValue Chain;
   unsigned NumOps = Node->getNumOperands();
+  if (NumOps == 0)
+    return;
   if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
     Chain = Node->getOperand(NumOps-1);
   if (!Chain)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 37b1131d2f8a337..c0a31f9af2957ef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3372,6 +3372,20 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
       DAG.setRoot(DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops));
       break;
     }
+    case Intrinsic::detached_rethrow:
+      // Treat detached_rethrow intrinsics like resumes.
+      llvm_unreachable("SelectionDAGBuilder shouldn't visit detached_rethrow "
+                       "instructions!");
+      break;
+    case Intrinsic::taskframe_resume:
+      // Treat taskframe_resume intrinsics like resumes.
+      llvm_unreachable("SelectionDAGBuilder shouldn't visit taskframe_resume "
+                       "instructions!");
+      break;
+    case Intrinsic::sync_unwind:
+      // Treat sync_unwind intrinsics like donothing: ignore them and jump
+      // directly to the next BB.
+      break;
     }
   } else if (I.hasDeoptState()) {
     // Currently we do not lower any intrinsic calls with deopt operand bundles.
@@ -3554,6 +3568,65 @@ void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) {
   DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
 }
 
+void SelectionDAGBuilder::visitDetach(const DetachInst &I) {
+  MachineBasicBlock *DetachMBB = FuncInfo.MBB;
+
+  // Update machine-CFG edges.
+  MachineBasicBlock *Detached = FuncInfo.MBBMap[I.getSuccessor(0)];
+  //MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(1)];
+
+  // Update machine-CFG edges.
+  DetachMBB->addSuccessor(Detached);
+
+  // If this is not a fall-through branch or optimizations are switched off,
+  // emit the branch.
+  if (Detached != NextBlock(DetachMBB) || TM.getOptLevel() == CodeGenOpt::None)
+    DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
+                            MVT::Other, getControlRoot(),
+                            DAG.getBasicBlock(Detached)));
+
+  return;
+
+}
+
+void SelectionDAGBuilder::visitReattach(const ReattachInst &I) {
+  MachineBasicBlock *ReattachMBB = FuncInfo.MBB;
+
+  // Update machine-CFG edges.
+  MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(0)];
+
+  // Update machine-CFG edges.
+  ReattachMBB->addSuccessor(Continue);
+
+  // If this is not a fall-through branch or optimizations are switched off,
+  // emit the branch.
+  if (Continue != NextBlock(ReattachMBB) || TM.getOptLevel() == CodeGenOpt::None)
+    DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
+                            MVT::Other, getControlRoot(),
+                            DAG.getBasicBlock(Continue)));
+
+  return;
+}
+
+void SelectionDAGBuilder::visitSync(const SyncInst &I) {
+  MachineBasicBlock *SyncMBB = FuncInfo.MBB;
+
+  // Update machine-CFG edges.
+  MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(0)];
+
+  // Update machine-CFG edges.
+  SyncMBB->addSuccessor(Continue);
+
+  // If this is not a fall-through branch or optimizations are switched off,
+  // emit the branch.
+ if (Continue != NextBlock(SyncMBB) || TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), + MVT::Other, getControlRoot(), + DAG.getBasicBlock(Continue))); + + return; +} + void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) { SDNodeFlags Flags; if (auto *FPOp = dyn_cast(&I)) @@ -8128,6 +8201,36 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, visitVectorHistogram(I, Intrinsic); return; } + // Tapir intrinsics + case Intrinsic::syncregion_start: + // Lower the starting point of a Tapir sync region to a no-op. + return; + case Intrinsic::taskframe_load_guard: + // Discard any taskframe.load.guards. + return; + case Intrinsic::taskframe_create: + // Discard any taskframe.creates. + return; + case Intrinsic::taskframe_use: + // Discard any taskframe.uses. + return; + case Intrinsic::taskframe_end: + // Discard any taskframe.ends. + return; + case Intrinsic::sync_unwind: + // Discard any sync.unwinds. + return; + case Intrinsic::tapir_runtime_start: + // Discard any tapir.runtime.starts. + return; + case Intrinsic::tapir_runtime_end: + // Discard any tapir.runtime.ends. + return; + case Intrinsic::task_frameaddress: + setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, + TLI.getFrameIndexTy(DAG.getDataLayout()), + getValue(I.getArgOperand(0)))); + return; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 1a98fbd7589fbc0..273da1102d9ee4b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -58,6 +58,7 @@ class Constant; class ConstrainedFPIntrinsic; class DbgValueInst; class DataLayout; +class DetachInst; class DIExpression; class DILocalVariable; class DILocation; @@ -74,6 +75,7 @@ class LLVMContext; class LoadInst; class MachineBasicBlock; class PHINode; +class ReattachInst; class ResumeInst; class ReturnInst; class SDDbgValue; @@ -81,6 +83,7 @@ class SelectionDAG; class StoreInst; class SwiftErrorValueTracking; class SwitchInst; +class SyncInst; class TargetLibraryInfo; class TargetMachine; class Type; @@ -511,6 +514,9 @@ class SelectionDAGBuilder { void visitCatchRet(const CatchReturnInst &I); void visitCatchPad(const CatchPadInst &I); void visitCleanupPad(const CleanupPadInst &CPI); + void visitDetach(const DetachInst& I); + void visitReattach(const ReattachInst& I); + void visitSync(const SyncInst& I); BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index b961d3bb1fec7f6..336553fa08bb068 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -795,6 +795,22 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { } } + // Determine if there is a call to setjmp in the machine function. + MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice()); + + // Determine if there is a call to a function that returns twice that is not a + // call to the eh.sjlj.setjmp intrinsic. 
+ for (const Instruction &I : instructions(Fn)) + if (const auto *Call = dyn_cast(&I)) + if (Call->hasFnAttr(Attribute::ReturnsTwice)) { + if (const Function *Called = Call->getCalledFunction()) + if (Called->getIntrinsicID() == + Intrinsic::eh_sjlj_setjmp) + continue; + MF->setExposesOpaqueReturnsTwice(true); + break; + } + // Determine if floating point is used for msvc computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, MF->getMMI()); diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp index 239572bf773e8dd..049f4c1ec9cafe4 100644 --- a/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -989,7 +989,8 @@ bool ShrinkWrap::isShrinkWrapEnabled(const MachineFunction &MF) { !(MF.getFunction().hasFnAttribute(Attribute::SanitizeAddress) || MF.getFunction().hasFnAttribute(Attribute::SanitizeThread) || MF.getFunction().hasFnAttribute(Attribute::SanitizeMemory) || - MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress)); + MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress) || + MF.getFunction().hasFnAttribute(Attribute::SanitizeCilk)); // If EnableShrinkWrap is set, it takes precedence on whatever the // target sets. The rational is that we assume we want to test // something related to shrink-wrapping. diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index c5fa4e6211a6310..f2f585203301d63 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -80,6 +80,12 @@ static cl::opt "same time) to consider tail duplicating blocks."), cl::init(16), cl::Hidden); +// 0 = disable, 1 = enable CBZ optimization, 2 = increase block size threshold +static cl::opt TailDupCBZ( + "tail-dup-cbz", + cl::desc("More aggressive merging of blocks ending with cbz"), + cl::init(2), cl::Hidden); + static cl::opt TailDupVerify("tail-dup-verify", cl::desc("Verify sanity of PHI instructions during taildup"), @@ -164,7 +170,7 @@ static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) { /// all Preds that received a copy of \p MBB. /// \p RemovalCallback - if non-null, called just before MBB is deleted. bool TailDuplicator::tailDuplicateAndUpdate( - bool IsSimple, MachineBasicBlock *MBB, + const BlockDesc &Desc, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl *DuplicatedPreds, function_ref *RemovalCallback, @@ -175,7 +181,7 @@ bool TailDuplicator::tailDuplicateAndUpdate( SmallVector TDBBs; SmallVector Copies; - if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred, + if (!tailDuplicate(Desc, MBB, ForcedLayoutPred, TDBBs, Copies, CandidatePtr)) return false; @@ -273,6 +279,14 @@ bool TailDuplicator::tailDuplicateAndUpdate( return true; } +BlockDesc TailDuplicator::getBlockDesc(MachineBasicBlock *MBB) { + BlockDesc Desc; + Desc.IsSimple = isSimpleBB(MBB); + if (TailDupCBZ > 0) + Desc.BRNZ = TII->isZeroTest(*MBB); + return Desc; +} + /// Look for small blocks that are unconditionally branched to and do not fall /// through. Tail-duplicate their instructions into their predecessors to /// eliminate (dynamic) branches. 
@@ -289,12 +303,12 @@ bool TailDuplicator::tailDuplicateBlocks() { if (NumTails == TailDupLimit) break; - bool IsSimple = isSimpleBB(&MBB); + BlockDesc Desc = getBlockDesc(&MBB); - if (!shouldTailDuplicate(IsSimple, MBB)) + if (!shouldTailDuplicate(Desc, MBB)) continue; - MadeChange |= tailDuplicateAndUpdate(IsSimple, &MBB, nullptr); + MadeChange |= tailDuplicateAndUpdate(Desc, &MBB, nullptr); } if (PreRegAlloc && TailDupVerify) @@ -562,12 +576,12 @@ void TailDuplicator::updateSuccessorsPHIs( } /// Determine if it is profitable to duplicate this block. -bool TailDuplicator::shouldTailDuplicate(bool IsSimple, +bool TailDuplicator::shouldTailDuplicate(const BlockDesc &Desc, MachineBasicBlock &TailBB) { // When doing tail-duplication during layout, the block ordering is in flux, // so canFallThrough returns a result based on incorrect information and // should just be ignored. - if (!LayoutMode && TailBB.canFallThrough()) + if (!LayoutMode && !Desc.BRNZ && TailBB.canFallThrough()) return false; // Don't try to tail-duplicate single-block loops. @@ -592,6 +606,8 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple, MaxDuplicateCount = TailDuplicateSize; else MaxDuplicateCount = TailDupSize; + if (Desc.BRNZ) + MaxDuplicateCount += (TailDupCBZ > 1) + Desc.BRNZ.value().IsKill; if (OptForSize) MaxDuplicateCount = 1; @@ -690,7 +706,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple, if (HasIndirectbr && PreRegAlloc) return true; - if (IsSimple) + if (Desc.IsSimple) return true; if (!PreRegAlloc) @@ -832,6 +848,33 @@ bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB, return true; } +static bool Contains(const SmallVectorImpl &Regs, Register Key) { + for (Register Reg : Regs) { + if (Key == Reg) + return true; + } + return false; +} + +static bool SafeToDelete(const MachineInstr &MI) { + if (MI.hasUnmodeledSideEffects() || MI.mayStore() || MI.isCall() || + MI.hasOrderedMemoryRef()) + return false; + if (MI.getNumDefs() <= 1) + return true; + bool SawFirst = false; + unsigned Ops = MI.getNumOperands(); + for (unsigned I = 0; I < Ops; ++I) { + const MachineOperand &MO = MI.getOperand(I); + if (MO.isDef() && !MO.isDead()) { + if (SawFirst) + return false; + SawFirst = true; + } + } + return true; +} + /// If it is profitable, duplicate TailBB's contents in each /// of its predecessors. /// \p IsSimple result of isSimpleBB @@ -842,7 +885,8 @@ bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB, /// into. /// \p Copies A vector of copy instructions inserted. Used later to /// walk all the inserted copies and remove redundant ones. -bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, +bool TailDuplicator::tailDuplicate(const BlockDesc &Desc, + MachineBasicBlock *TailBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies, @@ -855,7 +899,7 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, DenseSet UsedByPhi; getRegsUsedByPHIs(*TailBB, &UsedByPhi); - if (IsSimple) + if (Desc.IsSimple) return duplicateSimpleBB(TailBB, TDBBs, UsedByPhi); // Iterate through all the unique predecessors and tail-duplicate this @@ -875,10 +919,42 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, if (!canTailDuplicate(TailBB, PredBB)) continue; + int64_t PredValue = 0; + MachineInstr *RegSet = nullptr; + // If Live is true, the value produced by RegSet is used other + // than by a conditional branch. 
+ bool Live = false; // liveness of RegSet + const BlockBRNZ *BRNZ = Desc.BRNZ ? &Desc.BRNZ.value() : nullptr; + if (BRNZ) { + Live = !BRNZ->IsKill; + const TargetRegisterInfo *TRI = MF->getRegInfo().getTargetRegisterInfo(); + // Search backwards for an instruction that sets any of the + // registers in Desc.Regs + for (MachineBasicBlock::reverse_iterator MI = PredBB->instr_rbegin(); + MI != PredBB->instr_rend(); ++MI) { + Register Dest; + if (TII->isSetConstant(*MI, Dest, PredValue) + && Contains(BRNZ->Regs, Dest)) { + RegSet = &*MI; + Live = Live || !SafeToDelete(*MI); + break; + } + for (Register Reg : BRNZ->Regs) { + if (MI->modifiesRegister(Reg, TRI) || MI->killsRegister(Reg, TRI)) { + goto loop_exit; // double break + } + if (!Live && MI->readsRegister(Reg, TRI)) { + Live = true; + } + } + } + loop_exit:; + } + // Don't duplicate into a fall-through predecessor (at least for now). // If profile is available, findDuplicateCandidates can choose better // fall-through predecessor. - if (!(MF->getFunction().hasProfileData() && LayoutMode)) { + if (!RegSet && !(MF->getFunction().hasProfileData() && LayoutMode)) { bool IsLayoutSuccessor = false; if (ForcedLayoutPred) IsLayoutSuccessor = (ForcedLayoutPred == PredBB); @@ -896,6 +972,18 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, // Remove PredBB's unconditional branch. TII->removeBranch(*PredBB); + // If RegSet is true the tail block branch becomes unconditional. + MachineBasicBlock *Succ = nullptr; + if (RegSet) { + if (!Live) { + PredBB->erase(RegSet); + RegSet = nullptr; + } + Succ = PredValue ? BRNZ->Nonzero : BRNZ->Zero; + if (!Succ) + Succ = TailBB->getFallThrough(); + } + // Clone the contents of TailBB into PredBB. DenseMap LocalVRMap; SmallVector, 4> CopyInfos; @@ -918,8 +1006,15 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, PredBB->removeSuccessor(PredBB->succ_begin()); assert(PredBB->succ_empty() && "TailDuplicate called on block with multiple successors!"); - for (MachineBasicBlock *Succ : TailBB->successors()) - PredBB->addSuccessor(Succ, MBPI->getEdgeProbability(TailBB, Succ)); + if (Succ) { + TII->removeBranchAndFlags(*PredBB); + TII->insertUnconditionalBranch(*PredBB, Succ, + TailBB->rbegin()->getDebugLoc()); + PredBB->addSuccessor(Succ, BranchProbability::getOne()); + } else { + for (MachineBasicBlock *Succ : TailBB->successors()) + PredBB->addSuccessor(Succ, MBPI->getEdgeProbability(TailBB, Succ)); + } // Update branches in pred to jump to tail's layout successor if needed. if (ShouldUpdateTerminators) diff --git a/llvm/lib/CodeGen/TapirCleanup.cpp b/llvm/lib/CodeGen/TapirCleanup.cpp new file mode 100644 index 000000000000000..bd1422ec29f308c --- /dev/null +++ b/llvm/lib/CodeGen/TapirCleanup.cpp @@ -0,0 +1,101 @@ +//===- TapirCleanup - Cleanup leftover Tapir tasks for code generation ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass serializes any remaining Tapir instructions before code generation. +// Typically this pass should have no effect, because Tapir instructions should +// have been lowered already to a particular parallel runtime. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "tapircleanup" + +STATISTIC(NumTasksSerialized, "Number of Tapir tasks serialized"); +STATISTIC(NumTaskFramesErased, "Number of taskframes erased"); + +namespace { +class TapirCleanup : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid. + + TapirCleanup() : FunctionPass(ID) {} + + bool runOnFunction(Function &Fn) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + StringRef getPassName() const override { + return "Tapir last-minute cleanup for CodeGen"; + } +}; +} // end anonymous namespace + +char TapirCleanup::ID = 0; + +INITIALIZE_PASS_BEGIN(TapirCleanup, DEBUG_TYPE, + "Cleanup Tapir", false, false) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_END(TapirCleanup, DEBUG_TYPE, + "Cleanup Tapir", false, false) + +FunctionPass *llvm::createTapirCleanupPass() { return new TapirCleanup(); } + +void TapirCleanup::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); +} + +bool TapirCleanup::runOnFunction(Function &F) { + TaskInfo &TI = getAnalysis().getTaskInfo(); + auto &ORE = getAnalysis().getORE(); + + bool Changed = false; + + // If we haven't lowered the Tapir task to a particular parallel runtime by + // this point, simply serialize the task. + for (Task *T : post_order(TI.getRootTask())) { + if (T->isRootTask()) + continue; + ORE.emit(DiagnosticInfoOptimizationFailure(DEBUG_TYPE, "CleanedUpTapir", + T->getDetach()->getDebugLoc(), + T->getDetach()->getParent()) + << "CodeGen found Tapir instructions to serialize. Specify a " + "Tapir back-end to lower Tapir instructions to a parallel " + "runtime."); + + SerializeDetach(T->getDetach(), T); + NumTasksSerialized++; + Changed = true; + } + + // Get the set of taskframes to erase. + SmallVector TaskFramesToErase; + for (BasicBlock &BB : F) + for (Instruction &I : BB) + if (isTapirIntrinsic(Intrinsic::taskframe_create, &I)) + TaskFramesToErase.push_back(&I); + + for (Instruction *TFCreate : TaskFramesToErase) { + eraseTaskFrame(TFCreate); + ++NumTaskFramesErased; + Changed = true; + } + + return Changed; +} diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 2be7fc90a0e75ed..190b850dc2ac529 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1770,6 +1770,9 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case CatchPad: return 0; case CatchSwitch: return 0; case CleanupPad: return 0; + case Detach: return 0; + case Reattach: return 0; + case Sync: return 0; case FNeg: return ISD::FNEG; case Add: return ISD::ADD; case FAdd: return ISD::FADD; diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 3658e8320a0ccd5..cabc8e527777ebf 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -856,6 +856,9 @@ void TargetPassConfig::addIRPasses() { // Make sure that no unreachable blocks are instruction selected. 
addPass(createUnreachableBlockEliminationPass()); + // Make sure there are no remaining Tapir instructions. + addPass(createTapirCleanupPass()); + // Prepare expensive constants for SelectionDAG. if (getOptLevel() != CodeGenOptLevel::None && !DisableConstantHoisting) addPass(createConstantHoistingPass()); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 6599730590de603..74c2e5be597065d 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -4284,6 +4284,33 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(BI.getSuccessor(0), true); Out << ", "; writeOperand(BI.getSuccessor(1), true); + } else if (isa(I)) { + // Special case detach instruction to get formatting nice and correct + const DetachInst &DI(cast(I)); + Out << " within "; + writeOperand(DI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + writeOperand(DI.getDetached(), true); + Out << ", "; + writeOperand(DI.getContinue(), true); + if (DI.hasUnwindDest()) { + Out << " unwind "; + writeOperand(DI.getUnwindDest(), true); + } + } else if (isa(I)) { + // Special case reattach instruction to get formatting nice and correct + const ReattachInst &RI(cast(I)); + Out << " within "; + writeOperand(RI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + writeOperand(RI.getSuccessor(0), true); + } else if (isa(I)) { + // Special case sync instruction to get formatting nice and correct + const SyncInst &SI(cast(I)); + Out << " within "; + writeOperand(SI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + writeOperand(SI.getSuccessor(0), true); } else if (isa(I)) { const SwitchInst& SI(cast(I)); diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 46896d3cdf7d50a..95e3f2e7c287875 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -411,6 +411,25 @@ BasicBlock::getFirstNonPHIOrDbgOrLifetime(bool SkipPseudoOp) const { return nullptr; } +const Instruction * +BasicBlock::getFirstNonPHIOrDbgOrSyncUnwind(bool SkipPseudoOp) const { + for (const Instruction &I : *this) { + if (isa(I) || isa(I)) + continue; + + if (SkipPseudoOp && isa(I)) + continue; + + if (auto *CB = dyn_cast_or_null(&I)) + if (const Function *Called = CB->getCalledFunction()) + if (Intrinsic::sync_unwind == Called->getIntrinsicID()) + continue; + + return &I; + } + return nullptr; +} + BasicBlock::const_iterator BasicBlock::getFirstInsertionPt() const { const Instruction *FirstNonPHI = getFirstNonPHI(); if (!FirstNonPHI) diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index cf7bbf6b2576f00..ba0c240ff408447 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -4201,6 +4201,34 @@ LLVMValueRef LLVMBuildFCmp(LLVMBuilderRef B, LLVMRealPredicate Op, unwrap(LHS), unwrap(RHS), Name)); } +/*--.. Parallel constructs .................................................--*/ + +LLVMValueRef LLVMBuildDetach(LLVMBuilderRef B, + LLVMBasicBlockRef DetachBB, + LLVMBasicBlockRef ContinueBB, + LLVMValueRef SyncRegion) +{ + return wrap(unwrap(B)->CreateDetach(unwrap(DetachBB), + unwrap(ContinueBB), + unwrap(SyncRegion))); +} + +LLVMValueRef LLVMBuildReattach(LLVMBuilderRef B, + LLVMBasicBlockRef ReattachBB, + LLVMValueRef SyncRegion) +{ + return wrap(unwrap(B)->CreateReattach(unwrap(ReattachBB), + unwrap(SyncRegion))); +} + +LLVMValueRef LLVMBuildSync(LLVMBuilderRef B, + LLVMBasicBlockRef ContinueBB, + LLVMValueRef SyncRegion) +{ + return wrap(unwrap(B)->CreateSync(unwrap(ContinueBB), + unwrap(SyncRegion))); +} + /*--.. 
Miscellaneous instructions ..........................................--*/ LLVMValueRef LLVMBuildPhi(LLVMBuilderRef B, LLVMTypeRef Ty, const char *Name) { diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 7fa1f9696d43b2d..4cfe7a5fbbb5441 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -120,7 +120,7 @@ findDbgIntrinsics(SmallVectorImpl &Result, Value *V, } }; - if (auto *L = LocalAsMetadata::getIfExists(V)) { + if (auto *L = ValueAsMetadata::getIfExists(V)) { AppendUsers(L); for (Metadata *AL : L->getAllArgListUsers()) { AppendUsers(AL); diff --git a/llvm/lib/IR/EHPersonalities.cpp b/llvm/lib/IR/EHPersonalities.cpp index 7c32601b8a83eae..8578e6d1813c4ae 100644 --- a/llvm/lib/IR/EHPersonalities.cpp +++ b/llvm/lib/IR/EHPersonalities.cpp @@ -43,6 +43,7 @@ EHPersonality llvm::classifyEHPersonality(const Value *Pers) { .Case("__gxx_wasm_personality_v0", EHPersonality::Wasm_CXX) .Case("__xlcxx_personality_v1", EHPersonality::XL_CXX) .Case("__zos_cxx_personality_v2", EHPersonality::ZOS_CXX) + .Case("__cilk_personality_v0", EHPersonality::Cilk_CXX) .Default(EHPersonality::Unknown); } @@ -76,6 +77,8 @@ StringRef llvm::getEHPersonalityName(EHPersonality Pers) { return "__xlcxx_personality_v1"; case EHPersonality::ZOS_CXX: return "__zos_cxx_personality_v2"; + case EHPersonality::Cilk_CXX: + return "__cilk_personality_v0"; case EHPersonality::Unknown: llvm_unreachable("Unknown EHPersonality!"); } diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 6f0f3f244c050ca..e5d0ba944aa5aad 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -712,6 +712,9 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case CatchPad: return "catchpad"; case CatchSwitch: return "catchswitch"; case CallBr: return "callbr"; + case Detach: return "detach"; + case Reattach: return "reattach"; + case Sync: return "sync"; // Standard unary operators... 
case FNeg: return "fneg"; @@ -932,6 +935,7 @@ bool Instruction::mayReadFromMemory() const { case Instruction::VAArg: case Instruction::Load: case Instruction::Fence: // FIXME: refine definition of mayReadFromMemory + case Instruction::Sync: // Like Instruction::Fence case Instruction::AtomicCmpXchg: case Instruction::AtomicRMW: case Instruction::CatchPad: @@ -950,6 +954,7 @@ bool Instruction::mayWriteToMemory() const { switch (getOpcode()) { default: return false; case Instruction::Fence: // FIXME: refine definition of mayWriteToMemory + case Instruction::Sync: // Like Instruction::Fence case Instruction::Store: case Instruction::VAArg: case Instruction::AtomicCmpXchg: @@ -1160,6 +1165,15 @@ bool Instruction::isDebugOrPseudoInst() const { return isa(this) || isa(this); } +bool Instruction::isTaskFrameMarker() const { + auto II = dyn_cast(this); + if (!II) + return false; + Intrinsic::ID ID = II->getIntrinsicID(); + return ID == Intrinsic::taskframe_create || ID == Intrinsic::taskframe_use || + ID == Intrinsic::taskframe_end || ID == Intrinsic::taskframe_resume; +} + const Instruction * Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const { for (const Instruction *I = getNextNode(); I; I = I->getNextNode()) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 7a8cf8c23049861..8dd6878035e6317 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1121,6 +1121,180 @@ UnreachableInst::UnreachableInst(LLVMContext &Context, : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr, 0, InsertBefore) {} +//===----------------------------------------------------------------------===// +// DetachInst Implementation +//===----------------------------------------------------------------------===// + +void DetachInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +void DetachInst::init(Value *SyncRegion, BasicBlock *Detached, + BasicBlock *Continue, BasicBlock *Unwind) { + Op<-1>() = SyncRegion; + Op<-2>() = Detached; + Op<-3>() = Continue; + if (Unwind) { + setSubclassData(true); + Op<-4>() = Unwind; + } +#ifndef NDEBUG + AssertOK(); +#endif +} + +DetachInst::DetachInst(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, Instruction *InsertBefore) + : Instruction(Type::getVoidTy(Detached->getContext()), + Instruction::Detach, + OperandTraits::op_end(this) - 3, 3, + InsertBefore) { + init(SyncRegion, Detached, Continue); +} + +DetachInst::DetachInst(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, BasicBlock *InsertAtEnd) + : Instruction(Type::getVoidTy(Detached->getContext()), + Instruction::Detach, + OperandTraits::op_end(this) - 3, 3, + InsertAtEnd) { + init(SyncRegion, Detached, Continue); +} + +DetachInst::DetachInst(BasicBlock *Detached, BasicBlock *Continue, + BasicBlock *Unwind, Value *SyncRegion, + Instruction *InsertBefore) + : Instruction(Type::getVoidTy(Detached->getContext()), + Instruction::Detach, + OperandTraits::op_end(this) - 4, 4, + InsertBefore) { + init(SyncRegion, Detached, Continue, Unwind); +} + +DetachInst::DetachInst(BasicBlock *Detached, BasicBlock *Continue, + BasicBlock *Unwind, Value *SyncRegion, + BasicBlock *InsertAtEnd) + : Instruction(Type::getVoidTy(Detached->getContext()), + Instruction::Detach, + OperandTraits::op_end(this) - 4, 4, + InsertAtEnd) { + init(SyncRegion, Detached, Continue, Unwind); +} + +DetachInst::DetachInst(const DetachInst &DI) + : Instruction(Type::getVoidTy(DI.getContext()), 
Instruction::Detach, + OperandTraits::op_end(this) - + DI.getNumOperands(), + DI.getNumOperands()) { + setSubclassData( + DI.getSubclassData()); + Op<-1>() = DI.Op<-1>(); + Op<-2>() = DI.Op<-2>(); + Op<-3>() = DI.Op<-3>(); + if (DI.hasUnwindDest()) { + Op<-4>() = DI.Op<-4>(); + assert(DI.getNumOperands() == 4 && "Detach must have 4 operands!"); + } else + assert(DI.getNumOperands() == 3 && "Detach must have 3 operands!"); +} + +LandingPadInst *DetachInst::getLandingPadInst() const { + if (!hasUnwindDest()) + return nullptr; + return cast(getUnwindDest()->getFirstNonPHI()); +} + +//===----------------------------------------------------------------------===// +// ReattachInst Implementation +//===----------------------------------------------------------------------===// + +void ReattachInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +ReattachInst::ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + Instruction *InsertBefore) + : Instruction(Type::getVoidTy(DetachContinue->getContext()), + Instruction::Reattach, + OperandTraits::op_end(this) - 2, 2, + InsertBefore) { + Op<-1>() = SyncRegion; + Op<-2>() = DetachContinue; +#ifndef NDEBUG + AssertOK(); +#endif +} + +ReattachInst::ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + BasicBlock *InsertAtEnd) + : Instruction(Type::getVoidTy(DetachContinue->getContext()), + Instruction::Reattach, + OperandTraits::op_end(this) - 2, 2, + InsertAtEnd) { + Op<-1>() = SyncRegion; + Op<-2>() = DetachContinue; +#ifndef NDEBUG + AssertOK(); +#endif +} + +ReattachInst::ReattachInst(const ReattachInst &RI) + : Instruction(Type::getVoidTy(RI.getContext()), Instruction::Reattach, + OperandTraits::op_end(this) - + RI.getNumOperands(), + RI.getNumOperands()) { + Op<-1>() = RI.Op<-1>(); + Op<-2>() = RI.Op<-2>(); + assert(RI.getNumOperands() == 2 && "Reattach must have 2 operands!"); + SubclassOptionalData = RI.SubclassOptionalData; +} + +//===----------------------------------------------------------------------===// +// SyncInst Implementation +//===----------------------------------------------------------------------===// + +void SyncInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +SyncInst::SyncInst(BasicBlock *Continue, Value *SyncRegion, + Instruction *InsertBefore) + : Instruction(Type::getVoidTy(Continue->getContext()), Instruction::Sync, + OperandTraits::op_end(this) - 2, 2, + InsertBefore) { + Op<-1>() = SyncRegion; + Op<-2>() = Continue; +#ifndef NDEBUG + AssertOK(); +#endif +} + +SyncInst::SyncInst(BasicBlock *Continue, Value *SyncRegion, + BasicBlock *InsertAtEnd) + : Instruction(Type::getVoidTy(Continue->getContext()), Instruction::Sync, + OperandTraits::op_end(this) - 2, 2, + InsertAtEnd) { + Op<-1>() = SyncRegion; + Op<-2>() = Continue; +#ifndef NDEBUG + AssertOK(); +#endif +} + + +SyncInst::SyncInst(const SyncInst &SI) + : Instruction(Type::getVoidTy(SI.getContext()), Instruction::Sync, + OperandTraits::op_end(this) - SI.getNumOperands(), + SI.getNumOperands()) { + Op<-1>() = SI.Op<-1>(); + Op<-2>() = SI.Op<-2>(); + assert(SI.getNumOperands() == 2 && "Sync must have 2 operands!"); + SubclassOptionalData = SI.SubclassOptionalData; +} + //===----------------------------------------------------------------------===// // BranchInst Implementation //===----------------------------------------------------------------------===// @@ -4397,3 +4571,15 @@ UnreachableInst *UnreachableInst::cloneImpl() const { 
FreezeInst *FreezeInst::cloneImpl() const { return new FreezeInst(getOperand(0)); } + +DetachInst *DetachInst::cloneImpl() const { + return new(getNumOperands()) DetachInst(*this); +} + +ReattachInst *ReattachInst::cloneImpl() const { + return new(getNumOperands()) ReattachInst(*this); +} + +SyncInst *SyncInst::cloneImpl() const { + return new(getNumOperands()) SyncInst(*this); +} diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index db3b0196f66fd69..c1a01e0617ae234 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -60,6 +60,7 @@ bool IntrinsicInst::mayLowerToFunctionCall(Intrinsic::ID IID) { case Intrinsic::objc_retain_autorelease: case Intrinsic::objc_sync_enter: case Intrinsic::objc_sync_exit: + case Intrinsic::tapir_loop_grainsize: return true; default: return false; diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 5c61ad9f000b030..55342099821defa 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -442,6 +442,13 @@ bool StructType::containsHomogeneousScalableVectorTypes() const { return true; } +StructType *StructType::lookupOrCreate(LLVMContext &Context, StringRef Name) { + StructType *Ty = Context.pImpl->NamedStructTypes.lookup(Name); + if (!Ty) + Ty = StructType::create(Context, Name); + return Ty; +} + void StructType::setBody(ArrayRef Elements, bool isPacked) { assert(isOpaque() && "Struct body already set!"); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index c5c407637cbf347..bf0b7da61dbdaa0 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -382,6 +382,10 @@ class Verifier : public InstVisitor, VerifierSupport { // Keeps track of duplicate function argument debug info. SmallVector DebugFnArgs; + // Keeps track of detach instructions whose task structures have been + // verified. + SmallPtrSet DetachesVisited; + TBAAVerifier TBAAVerifyHelper; ConvergenceVerifier ConvergenceVerifyHelper; @@ -606,6 +610,10 @@ class Verifier : public InstVisitor, VerifierSupport { void visitCatchSwitchInst(CatchSwitchInst &CatchSwitch); void visitCleanupReturnInst(CleanupReturnInst &CRI); + void verifyTask(const DetachInst *DI); + void visitDetachInst(DetachInst &DI); + void visitReattachInst(ReattachInst &RI); + void verifySwiftErrorCall(CallBase &Call, const Value *SwiftErrorVal); void verifySwiftErrorValue(const Value *SwiftErrorVal); void verifyTailCCMustTailAttrs(const AttrBuilder &Attrs, StringRef Context); @@ -3190,6 +3198,121 @@ void Verifier::visitCallBrInst(CallBrInst &CBI) { visitTerminator(CBI); } +// Check if the given instruction is an intrinsic with the specified ID. If a +// value \p V is specified, then additionally checks that the first argument of +// the intrinsic matches \p V. +static bool isTapirIntrinsic(Intrinsic::ID ID, const Instruction *I, + const Value *V) { + if (const CallBase *CB = dyn_cast(I)) + if (const Function *Called = CB->getCalledFunction()) + if (ID == Called->getIntrinsicID()) + if (!V || (V == CB->getArgOperand(0))) + return true; + return false; +} + +/// Returns true if the given instruction performs a detached.rethrow, false +/// otherwise. If \p SyncRegion is specified, then additionally checks that the +/// detached.rethrow uses \p SyncRegion. 
+static bool isDetachedRethrow(const Instruction *I, + const Value *SyncRegion = nullptr) { + return isa(I) && + isTapirIntrinsic(Intrinsic::detached_rethrow, I, SyncRegion); +} + +void Verifier::verifyTask(const DetachInst *DI) { + SmallVector Worklist; + SmallPtrSet Visited; + Worklist.push_back(DI->getDetached()); + do { + const BasicBlock *BB = Worklist.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + if (const DetachInst *SDI = dyn_cast(BB->getTerminator())) { + Check(DI != SDI, "Detached task reaches its own detach", DI); + if (DetachesVisited.insert(SDI).second) + // Recursively verify the detached task. + verifyTask(SDI); + + // Add the continuation and unwind destination to the worklist. + Worklist.push_back(SDI->getContinue()); + if (SDI->hasUnwindDest()) + Worklist.push_back(SDI->getUnwindDest()); + continue; + } + + if (const ReattachInst *RI = dyn_cast(BB->getTerminator())) { + Check(DI->getSyncRegion() == RI->getSyncRegion(), + "Mismatched sync regions between detach and reattach", DI, RI); + Check(RI->getDetachContinue() == DI->getContinue(), + "Mismatched continuations between detach and reattach", DI, RI); + // Don't add the successor of the reattach, since that's outside of the + // task. + continue; + } + + if (const InvokeInst *II = dyn_cast(BB->getTerminator())) { + if (isDetachedRethrow(II)) { + Check(DI->getSyncRegion() == II->getArgOperand(0), + "Mismatched sync regions between detach and detached.rethrow", DI, + II); + Check(isa(II->getNormalDest()->getTerminator()), + "detached.rethrow intrinsic has an " + "unexpected normal destination.", + DI, II); + Check(DI->hasUnwindDest(), + "Task contains a detached.rethrow terminator, but detach has no " + "unwind destination", + DI, II); + Check(DI->getUnwindDest() == II->getUnwindDest(), + "Mismatched unwind destinations between detach and " + "detached.rethrow", + DI, II); + // Don't add the successors of the detached.rethrow, since they're + // outside of the task. + continue; + } + } + + // Check that do not encounter a return or resume in the middle of the + // task. + Check(!isa(BB->getTerminator()) && + !isa(BB->getTerminator()), + "Unexpected return or resume in task", BB->getTerminator()); + + // Add the successors of this basic block. + for (const BasicBlock *Successor : successors(BB)) + Worklist.push_back(Successor); + + } while (!Worklist.empty()); +} + +void Verifier::visitReattachInst(ReattachInst &RI) { + if (DT.isReachableFromEntry(RI.getParent())) { + // Check that the continuation of the reattach has a detach predecessor. + const BasicBlock *Continue = RI.getDetachContinue(); + bool FoundDetachPred = false; + for (const BasicBlock *Pred : predecessors(Continue)) { + if (isa(Pred->getTerminator()) && + DT.dominates(Pred, RI.getParent())) { + FoundDetachPred = true; + break; + } + } + Check(FoundDetachPred, + "No detach predecessor found for successor of reattach.", &RI); + } + visitTerminator(RI); +} + +void Verifier::visitDetachInst(DetachInst &DI) { + if (DetachesVisited.insert(&DI).second) + verifyTask(&DI); + + visitTerminator(DI); +} + void Verifier::visitSelectInst(SelectInst &SI) { Check(!SelectInst::areInvalidOperands(SI.getOperand(0), SI.getOperand(1), SI.getOperand(2)), @@ -4385,6 +4508,14 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { // landing pad block may be branched to only by the unwind edge of an // invoke. 
for (BasicBlock *PredBB : predecessors(BB)) { + if (const auto *DI = dyn_cast(PredBB->getTerminator())) { + Check(DI && DI->getUnwindDest() == BB && DI->getDetached() != BB && + DI->getContinue() != BB, + "A detach can only jump to a block containing a LandingPadInst " + "as the unwind destination.", + LPI); + continue; + } const auto *II = dyn_cast(PredBB->getTerminator()); Check(II && II->getUnwindDest() == BB && II->getNormalDest() != BB, "Block containing LandingPadInst must be jumped to " @@ -5120,9 +5251,13 @@ void Verifier::visitInstruction(Instruction &I) { F->getIntrinsicID() == Intrinsic::experimental_patchpoint || F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint || F->getIntrinsicID() == Intrinsic::wasm_rethrow || + F->getIntrinsicID() == Intrinsic::detached_rethrow || + F->getIntrinsicID() == Intrinsic::taskframe_resume || + F->getIntrinsicID() == Intrinsic::sync_unwind || IsAttachedCallOperand(F, CBI, i), "Cannot invoke an intrinsic other than donothing, patchpoint, " - "statepoint, coro_resume, coro_destroy or clang.arc.attachedcall", + "statepoint, coro_resume, coro_destroy, detached_rethrow, " + "taskframe_resume, sync_unwind or clang.arc.attachedcall", &I); Check(F->getParent() == &M, "Referencing function in another module!", &I, &M, F, F->getParent()); @@ -6322,6 +6457,18 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "llvm.ptrmask intrinsic second argument bitwidth must match " "pointer index type size of first argument", &Call); + case Intrinsic::syncregion_start: { + SmallVector DetachUsers; + for (const User *U : Call.users()) + if (const DetachInst *DI = dyn_cast(U)) + if (DT.isReachableFromEntry(DI->getParent())) + DetachUsers.push_back(DI); + + for (const DetachInst *DI1 : DetachUsers) + for (const DetachInst *DI2 : DetachUsers) + if (DI1 != DI2) + Check(!DT.dominates(DI1->getDetached(), DI2->getParent()), + "One detach user of a sync region dominates another", DI1, DI2); break; } case Intrinsic::threadlocal_address: { diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index bb3c9f7acdb8e5c..7e4af03c3113900 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -152,6 +152,7 @@ void llvm::computeLTOCacheKey( AddUnsigned(static_cast(Conf.CGFileType)); AddUnsigned(Conf.OptLevel); AddUnsigned(Conf.Freestanding); + AddUnsigned(static_cast(Conf.TapirTarget)); AddString(Conf.OptPipeline); AddString(Conf.AAPipeline); AddString(Conf.OverrideTriple); diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index d5d642f0d25e6cf..21a2e157067b065 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -230,6 +230,11 @@ createTargetMachine(const Config &Conf, const Target *TheTarget, Module &M) { return TM; } +static bool hasTapirTarget(const Config &Conf) { + return (Conf.TapirTarget != TapirTargetID::Last_TapirTargetID) && + (Conf.TapirTarget != TapirTargetID::None); +} + static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, unsigned OptLevel, bool IsThinLTO, ModuleSummaryIndex *ExportSummary, @@ -274,6 +279,10 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, std::unique_ptr TLII( new TargetLibraryInfoImpl(Triple(TM->getTargetTriple()))); + TLII->setTapirTarget(Conf.TapirTarget); + TLII->setTapirTargetOptions( + std::make_unique(Conf.OpenCilkABIBitcodeFile)); + TLII->addTapirTargetLibraryFunctions(); if (Conf.Freestanding) TLII->disableAllFunctions(); FAM.registerPass([&] { return TargetLibraryAnalysis(*TLII); }); @@ -327,9 
+336,11 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, Conf.OptPipeline + "': " + toString(std::move(Err))); } } else if (IsThinLTO) { - MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary)); + MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary, + hasTapirTarget(Conf))); } else { - MPM.addPass(PB.buildLTODefaultPipeline(OL, ExportSummary)); + MPM.addPass(PB.buildLTODefaultPipeline(OL, ExportSummary, + hasTapirTarget(Conf))); } if (!Conf.DisableVerify) @@ -408,6 +419,10 @@ static void codegen(const Config &Conf, TargetMachine *TM, legacy::PassManager CodeGenPasses; TargetLibraryInfoImpl TLII(Triple(Mod.getTargetTriple())); + TLII.setTapirTarget(Conf.TapirTarget); + TLII.setTapirTargetOptions( + std::make_unique(Conf.OpenCilkABIBitcodeFile)); + TLII.addTapirTargetLibraryFunctions(); CodeGenPasses.add(new TargetLibraryInfoWrapperPass(TLII)); CodeGenPasses.add( createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex)); diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt index 6425f4934b21034..a0e1ac9bbf222ac 100644 --- a/llvm/lib/Passes/CMakeLists.txt +++ b/llvm/lib/Passes/CMakeLists.txt @@ -28,6 +28,7 @@ add_llvm_component_library(LLVMPasses ObjCARC Scalar Support + TapirOpts Target TransformUtils Vectorize diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 5dbb1e2f498716b..fac875a2b4be0b0 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/CallPrinter.h" #include "llvm/Analysis/CostModel.h" #include "llvm/Analysis/CycleAnalysis.h" +#include "llvm/Analysis/DataRaceFreeAliasAnalysis.h" #include "llvm/Analysis/DDG.h" #include "llvm/Analysis/DDGPrinter.h" #include "llvm/Analysis/Delinearization.h" @@ -68,6 +69,8 @@ #include "llvm/Analysis/StackLifetime.h" #include "llvm/Analysis/StackSafetyAnalysis.h" #include "llvm/Analysis/StructuralHash.h" +#include "llvm/Analysis/TapirRaceDetect.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" @@ -183,6 +186,8 @@ #include "llvm/Transforms/Instrumentation/AddressSanitizer.h" #include "llvm/Transforms/Instrumentation/BoundsChecking.h" #include "llvm/Transforms/Instrumentation/CGProfile.h" +#include "llvm/Transforms/Instrumentation/CilkSanitizer.h" +#include "llvm/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.h" #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" @@ -279,6 +284,11 @@ #include "llvm/Transforms/Scalar/TLSVariableHoist.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" +#include "llvm/Transforms/Tapir/LoopSpawningTI.h" +#include "llvm/Transforms/Tapir/LoopStripMinePass.h" +#include "llvm/Transforms/Tapir/SerializeSmallTasks.h" +#include "llvm/Transforms/Tapir/TapirToTarget.h" +#include "llvm/Transforms/Tapir/DRFScopedNoAliasAA.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/BreakCriticalEdges.h" @@ -313,6 +323,8 @@ #include "llvm/Transforms/Utils/UnifyLoopExits.h" #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" +#include 
"llvm/Transforms/Utils/TaskCanonicalize.h" +#include "llvm/Transforms/Utils/TaskSimplify.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" @@ -320,8 +332,9 @@ using namespace llvm; -static const Regex DefaultAliasRegex( - "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$"); +static const Regex + DefaultAliasRegex("^(default|thinlto-pre-link|thinlto|lto-pre-link|lto|" + "tapir-lowering|tapir-lowering-loops)<(O[0123sz])>$"); namespace llvm { cl::opt PrintPipelinePasses( @@ -1212,8 +1225,8 @@ parseRegAllocFastPassOptions(PassBuilder &PB, StringRef Params) { /// Tests whether a pass name starts with a valid prefix for a default pipeline /// alias. static bool startsWithDefaultPipelineAliasPrefix(StringRef Name) { - return Name.starts_with("default") || Name.starts_with("thinlto") || - Name.starts_with("lto"); + return Name.startswith("default") || Name.startswith("thinlto") || + Name.startswith("lto") || Name.startswith("tapir-lowering"); } /// Tests whether registered callbacks will accept a given pass name. @@ -1525,6 +1538,10 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, MPM.addPass(buildThinLTOPreLinkDefaultPipeline(L)); else MPM.addPass(buildLTOPreLinkDefaultPipeline(L)); + } else if (Matches[1] == "tapir-lowering-loops") { + MPM.addPass(buildTapirLoopLoweringPipeline(L, ThinOrFullLTOPhase::None)); + } else if (Matches[1] == "tapir-lowering") { + MPM.addPass(buildTapirLoweringPipeline(L, ThinOrFullLTOPhase::None)); } else { assert(Matches[1] == "lto" && "Not one of the matched options!"); MPM.addPass(buildLTODefaultPipeline(L, nullptr)); diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 6f36bdad780ae35..40bae8c31e017be 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -18,12 +18,14 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/DataRaceFreeAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/Verifier.h" #include "llvm/Passes/OptimizationLevel.h" #include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CommandLine.h" @@ -124,6 +126,11 @@ #include "llvm/Transforms/Scalar/SpeculativeExecution.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" +#include "llvm/Transforms/Tapir/LoopSpawningTI.h" +#include "llvm/Transforms/Tapir/LoopStripMinePass.h" +#include "llvm/Transforms/Tapir/SerializeSmallTasks.h" +#include "llvm/Transforms/Tapir/TapirToTarget.h" +#include "llvm/Transforms/Tapir/DRFScopedNoAliasAA.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/CanonicalizeAliases.h" @@ -136,6 +143,8 @@ #include "llvm/Transforms/Utils/NameAnonGlobals.h" #include "llvm/Transforms/Utils/RelLookupTableConverter.h" #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" +#include "llvm/Transforms/Utils/TaskCanonicalize.h" +#include "llvm/Transforms/Utils/TaskSimplify.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include 
"llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" @@ -309,10 +318,16 @@ extern cl::opt EnableMemProfContextDisambiguation; extern cl::opt EnableInferAlignmentPass; } // namespace llvm +static cl::opt + VerifyTapirLowering("verify-tapir-lowering-npm", cl::init(false), + cl::Hidden, + cl::desc("Verify IR after Tapir lowering steps")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; SLPVectorization = false; + LoopStripmine = true; LoopUnrolling = true; ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; LicmMssaOptCap = SetLicmMssaOptCap; @@ -388,6 +403,16 @@ void PassBuilder::invokePipelineEarlySimplificationEPCallbacks( for (auto &C : PipelineEarlySimplificationEPCallbacks) C(MPM, Level); } +void PassBuilder::invokeTapirLateEPCallbacks(ModulePassManager &MPM, + OptimizationLevel Level) { + for (auto &C : TapirLateEPCallbacks) + C(MPM, Level); +} +void PassBuilder::invokeTapirLoopEndEPCallbacks(ModulePassManager &MPM, + OptimizationLevel Level) { + for (auto &C : TapirLoopEndEPCallbacks) + C(MPM, Level); +} // Helper to add AnnotationRemarksPass. static void addAnnotationRemarksPass(ModulePassManager &MPM) { @@ -420,6 +445,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, // Hoisting of scalars and load expressions. FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + FPM.addPass(TaskSimplifyPass()); FPM.addPass(InstCombinePass()); FPM.addPass(LibCallsShrinkWrapPass()); @@ -428,6 +454,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + FPM.addPass(TaskSimplifyPass()); // Form canonically associated expression trees, and simplify the trees using // basic mathematical properties. For example, this will form (nearly) @@ -497,6 +524,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, /*UseBlockFrequencyInfo=*/true)); FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + FPM.addPass(TaskSimplifyPass()); FPM.addPass(InstCombinePass()); // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. // *All* loop passes must preserve it, in order to be able to use it. @@ -535,6 +563,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(ADCEPass()); FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + FPM.addPass(TaskSimplifyPass()); FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); @@ -589,6 +618,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + FPM.addPass(TaskSimplifyPass()); FPM.addPass(InstCombinePass()); FPM.addPass(AggressiveInstCombinePass()); @@ -606,6 +636,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(TailCallElimPass()); FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + FPM.addPass(TaskSimplifyPass()); // Form canonically associated expression trees, and simplify the trees using // basic mathematical properties. 
For example, this will form (nearly) @@ -688,6 +719,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, /*UseBlockFrequencyInfo=*/true)); FPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + FPM.addPass(TaskSimplifyPass()); FPM.addPass(InstCombinePass()); // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. @@ -757,6 +789,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, .convertSwitchRangeToICmp(true) .hoistCommonInsts(true) .sinkCommonInsts(true))); + FPM.addPass(TaskSimplifyPass()); FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); @@ -1153,6 +1186,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, invokePeepholeEPCallbacks(GlobalCleanupPM, Level); GlobalCleanupPM.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + GlobalCleanupPM.addPass(TaskSimplifyPass()); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), PTO.EagerlyInvalidateAnalyses)); @@ -1289,6 +1323,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, /*UseBlockFrequencyInfo=*/true)); ExtraPasses.addPass( SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + // Cleanup tasks after the loop optimization passes. + ExtraPasses.addPass(TaskSimplifyPass()); ExtraPasses.addPass(InstCombinePass()); FPM.addPass(std::move(ExtraPasses)); } @@ -1326,6 +1362,9 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, // Enhance/cleanup vector code. FPM.addPass(VectorCombinePass()); + // Rerun EarlyCSE for further cleanup. + FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); + if (!IsFullLTO) { FPM.addPass(InstCombinePass()); // Unroll small loops to hide loop backedge latency and saturate any @@ -1468,6 +1507,33 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // rather than on each loop in an inside-out manner, and so they are actually // function passes. + // Stripmine Tapir loops, if pass is enabled. + if (PTO.LoopStripmine && Level != OptimizationLevel::O1 && + !Level.isOptimizingForSize()) { + LoopPassManager LPM1, LPM2; + LPM1.addPass(TapirIndVarSimplifyPass()); + OptimizePM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM1), + /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + OptimizePM.addPass(LoopStripMinePass()); + // Cleanup tasks after stripmining loops. + OptimizePM.addPass(TaskSimplifyPass()); + // Cleanup after stripmining loops. + LPM2.addPass(LoopSimplifyCFGPass()); + LPM2.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); + OptimizePM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM2), + /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + // Don't run IndVarSimplify at this point, as it can actually inhibit + // vectorization in some cases. + OptimizePM.addPass(JumpThreadingPass()); + OptimizePM.addPass(CorrelatedValuePropagationPass()); + OptimizePM.addPass(InstCombinePass()); + } + invokeVectorizerStartEPCallbacks(OptimizePM, Level); LoopPassManager LPM; @@ -1519,6 +1585,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, .convertSwitchRangeToICmp(true) .speculateUnpredictables(true))); + // Cleanup tasks as well. + OptimizePM.addPass(TaskSimplifyPass()); + // Add the core optimizing pipeline. 
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM), PTO.EagerlyInvalidateAnalyses)); @@ -1563,11 +1632,184 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, return MPM; } +ModulePassManager +PassBuilder::buildTapirLoopLoweringPipeline(OptimizationLevel Level, + ThinOrFullLTOPhase Phase) { + ModulePassManager MPM; + + LoopPassManager LPM1, LPM2; + + if (Level == OptimizationLevel::O0) + // Form SSA out of local memory accesses. + MPM.addPass( + createModuleToFunctionPassAdaptor(SROAPass(SROAOptions::ModifyCFG))); + + // Rotate Loop - disable header duplication at -Oz + LPM1.addPass(LoopRotatePass(Level != OptimizationLevel::Oz)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); + LPM2.addPass(IndVarSimplifyPass()); + + FunctionPassManager FPM; + // The loop pass in LPM2 (IndVarSimplifyPass) does not preserve MemorySSA. + // *All* loop passes must preserve it, in order to be able to use it. + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), + /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove basic blocks. + FPM.addPass(InstCombinePass()); + FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), + /*UseMemorySSA=*/false, + /*UseBlockFrequencyInfo=*/false)); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + + // Outline Tapir loops as needed. + MPM.addPass(LoopSpawningPass()); + if (VerifyTapirLowering) + MPM.addPass(VerifierPass()); + + // The LoopSpawning pass may leave cruft around. Clean it up using the + // function simplification pipeline. + if (Level != OptimizationLevel::O0) + MPM.addPass( + createModuleToFunctionPassAdaptor( + buildFunctionSimplificationPipeline(Level, Phase))); + + return MPM; +} + +ModulePassManager +PassBuilder::buildTapirLoweringPipeline(OptimizationLevel Level, + ThinOrFullLTOPhase Phase) { + ModulePassManager MPM; + + if (Level == OptimizationLevel::O0) { + // At -O0, simply translate the Tapir constructs and run always-inline. In + // particular, don't run loop-spawning. + + // Add passes to run just after Tapir loops are (or would be) processed. + for (auto &C : TapirLoopEndEPCallbacks) + C(MPM, Level); + + // Lower Tapir constructs to target runtime calls. + MPM.addPass(TapirToTargetPass()); + if (VerifyTapirLowering) + MPM.addPass(VerifierPass()); + + MPM.addPass(AlwaysInlinerPass( + /*InsertLifetimeIntrinsics=*/false)); + + return MPM; + } + + // Lower Tapir loops + MPM.addPass(buildTapirLoopLoweringPipeline(Level, Phase)); + + // Add passes to run just after Tapir loops are processed. + invokeTapirLoopEndEPCallbacks(MPM, Level); + + // Canonicalize the representation of tasks. + MPM.addPass(createModuleToFunctionPassAdaptor(TaskCanonicalizePass())); + + // Lower Tapir to target runtime calls. + MPM.addPass(TapirToTargetPass()); + if (VerifyTapirLowering) + MPM.addPass(VerifierPass()); + + // The TapirToTarget pass may leave cruft around. Clean it up using the + // function simplification pipeline. + MPM.addPass( + createModuleToFunctionPassAdaptor( + buildFunctionSimplificationPipeline(Level, Phase))); + + // Interprocedural constant propagation now that basic cleanup has occurred + // and prior to optimizing globals. + // FIXME: This position in the pipeline hasn't been carefully considered in + // years, it should be re-analyzed. 
+ MPM.addPass(IPSCCPPass()); + + // Attach metadata to indirect call sites indicating the set of functions + // they may target at run-time. This should follow IPSCCP. + MPM.addPass(CalledValuePropagationPass()); + + // Optimize globals to try and fold them into constants. + MPM.addPass(GlobalOptPass()); + + // Promote any localized globals to SSA registers. + // FIXME: Should this instead by a run of SROA? + // FIXME: We should probably run instcombine and simplify-cfg afterward to + // delete control flows that are dead once globals have been folded to + // constants. + MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass())); + + // Remove any dead arguments exposed by cleanups and constant folding + // globals. + MPM.addPass(DeadArgumentEliminationPass()); + + // Create a small function pass pipeline to cleanup after all the global + // optimizations. + FunctionPassManager GlobalCleanupPM; + GlobalCleanupPM.addPass(InstCombinePass()); + GlobalCleanupPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM))); + + // Synthesize function entry counts for non-PGO compilation. + if (EnableSyntheticCounts) + MPM.addPass(SyntheticCountsPropagation()); + + MPM.addPass(AlwaysInlinerPass( + /*InsertLifetimeIntrinsics=*/false)); + + // Require the GlobalsAA analysis for the module so we can query it within + // the CGSCC pipeline. + MPM.addPass(RequireAnalysisPass()); + + // Begin the postoder CGSCC pipeline. + CGSCCPassManager PostLowerCGPipeline; + + // Now deduce any function attributes based in the current code. + PostLowerCGPipeline.addPass(PostOrderFunctionAttrsPass()); + + // When at O3 add argument promotion to the pass pipeline. + // FIXME: It isn't at all clear why this should be limited to O3. + if (Level == OptimizationLevel::O3) + PostLowerCGPipeline.addPass(ArgumentPromotionPass()); + + // Lastly, add the core function simplification pipeline nested inside the + // CGSCC walk. + PostLowerCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( + buildFunctionSimplificationPipeline(Level, Phase))); + + // We wrap the CGSCC pipeline in a devirtualization repeater. This will try + // to detect when we devirtualize indirect calls and iterate the SCC passes + // in that case to try and catch knock-on inlining or function attrs + // opportunities. Then we add it to the module pipeline by walking the SCCs + // in postorder (or bottom-up). + MPM.addPass( + createModuleToPostOrderCGSCCPassAdaptor(createDevirtSCCRepeatedPass( + std::move(PostLowerCGPipeline), MaxDevirtIterations))); + + // Drop bodies of available eternally objects to improve GlobalDCE. + MPM.addPass(EliminateAvailableExternallyPass()); + + // Do RPO function attribute inference across the module to forward-propagate + // attributes where applicable. + // FIXME: Is this really an optimization rather than a canonicalization? + MPM.addPass(ReversePostOrderFunctionAttrsPass()); + + // Now that we have optimized the program, discard unreachable functions. 
+ MPM.addPass(GlobalDCEPass()); + + return MPM; +} + ModulePassManager PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, - bool LTOPreLink) { + bool LTOPreLink, bool LowerTapir) { if (Level == OptimizationLevel::O0) - return buildO0DefaultPipeline(Level, LTOPreLink); + return buildO0DefaultPipeline(Level, LTOPreLink, LowerTapir); ModulePassManager MPM; @@ -1601,6 +1843,18 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, if (LTOPreLink) addRequiredLTOPreLinkPasses(MPM); + + // Add passes to run just before Tapir lowering. + invokeTapirLateEPCallbacks(MPM, Level); + + // Lower Tapir if necessary + if (LowerTapir) + MPM.addPass(buildTapirLoweringPipeline( + Level, LTOPreLink ? ThinOrFullLTOPhase::FullLTOPreLink + : ThinOrFullLTOPhase::None)); + else + invokeTapirLoopEndEPCallbacks(MPM, Level); + return MPM; } @@ -1681,7 +1935,8 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { } ModulePassManager PassBuilder::buildThinLTODefaultPipeline( - OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) { + OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary, + bool LowerTapir) { ModulePassManager MPM; if (ImportSummary) { @@ -1729,6 +1984,16 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline( MPM.addPass(buildModuleOptimizationPipeline( Level, ThinOrFullLTOPhase::ThinLTOPostLink)); + // Add passes to run just before Tapir lowering. + invokeTapirLateEPCallbacks(MPM, Level); + + // Lower Tapir if necessary + if (LowerTapir) + MPM.addPass( + buildTapirLoweringPipeline(Level, ThinOrFullLTOPhase::ThinLTOPostLink)); + else + invokeTapirLoopEndEPCallbacks(MPM, Level); + // Emit annotation remarks. addAnnotationRemarksPass(MPM); @@ -1744,7 +2009,8 @@ PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, - ModuleSummaryIndex *ExportSummary) { + ModuleSummaryIndex *ExportSummary, + bool LowerTapir) { ModulePassManager MPM; invokeFullLinkTimeOptimizationEarlyEPCallbacks(MPM, Level); @@ -2055,6 +2321,16 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); + // Add passes to run just before Tapir lowering. + invokeTapirLateEPCallbacks(MPM, Level); + + // Lower Tapir if necessary + if (LowerTapir) + MPM.addPass( + buildTapirLoweringPipeline(Level, ThinOrFullLTOPhase::FullLTOPostLink)); + else + invokeTapirLoopEndEPCallbacks(MPM, Level); + // Emit annotation remarks. addAnnotationRemarksPass(MPM); @@ -2062,7 +2338,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, } ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, - bool LTOPreLink) { + bool LTOPreLink, + bool LowerTapir) { assert(Level == OptimizationLevel::O0 && "buildO0DefaultPipeline should only be used with O0"); @@ -2155,6 +2432,16 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, CoroPM.addPass(GlobalDCEPass()); MPM.addPass(CoroConditionalWrapper(std::move(CoroPM))); + // Add passes to run just before Tapir lowering. + invokeTapirLateEPCallbacks(MPM, Level); + + if (LowerTapir) + MPM.addPass(buildTapirLoweringPipeline( + Level, LTOPreLink ? 
ThinOrFullLTOPhase::FullLTOPreLink + : ThinOrFullLTOPhase::None)); + else + invokeTapirLoopEndEPCallbacks(MPM, Level); + invokeOptimizerLastEPCallbacks(MPM, Level); if (LTOPreLink) @@ -2165,6 +2452,84 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, return MPM; } +ModulePassManager +PassBuilder::buildPostCilkInstrumentationPipeline(OptimizationLevel Level) { + ModulePassManager MPM; + if (Level != OptimizationLevel::O0) { + FunctionPassManager FPM; + FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); + FPM.addPass(JumpThreadingPass()); + FPM.addPass(CorrelatedValuePropagationPass()); + FPM.addPass(SimplifyCFGPass()); + FPM.addPass(ReassociatePass()); + LoopPassManager LPM; + // Simplify the loop body. We do this initially to clean up after + // other loop passes run, either when iterating on a loop or on + // inner loops with implications on the outer loop. + LPM.addPass(LoopInstSimplifyPass()); + LPM.addPass(LoopSimplifyCFGPass()); + LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); + LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == + OptimizationLevel::O3)); + FPM.addPass( + RequireAnalysisPass()); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + FPM.addPass(SimplifyCFGPass()); + FPM.addPass(InstCombinePass()); + FPM.addPass(SCCPPass()); + FPM.addPass(BDCEPass()); + FPM.addPass(InstCombinePass()); + FPM.addPass(DSEPass()); + FPM.addPass(SimplifyCFGPass()); + FPM.addPass(InstCombinePass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) { + MPM.addPass(ModuleInlinerWrapperPass( + getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()))); + // Optimize globals. + MPM.addPass(GlobalOptPass()); + MPM.addPass(GlobalDCEPass()); + FunctionPassManager FPM; + FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); + FPM.addPass(JumpThreadingPass()); + FPM.addPass(CorrelatedValuePropagationPass()); + FPM.addPass(SimplifyCFGPass()); + FPM.addPass(ReassociatePass()); + LoopPassManager LPM; + // Simplify the loop body. We do this initially to clean up + // after other loop passes run, either when iterating on a loop + // or on inner loops with implications on the outer loop. 
+ LPM.addPass(LoopInstSimplifyPass()); + LPM.addPass(LoopSimplifyCFGPass()); + LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); + FPM.addPass( + RequireAnalysisPass()); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + FPM.addPass(SimplifyCFGPass()); + FPM.addPass(InstCombinePass()); + FPM.addPass(SCCPPass()); + FPM.addPass(BDCEPass()); + FPM.addPass(InstCombinePass()); + FPM.addPass(DSEPass()); + FPM.addPass(SimplifyCFGPass()); + FPM.addPass(InstCombinePass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + } + MPM.addPass(EliminateAvailableExternallyPass()); + MPM.addPass(GlobalDCEPass()); + + return MPM; +} + AAManager PassBuilder::buildDefaultAAPipeline() { AAManager AA; @@ -2188,6 +2553,11 @@ AAManager PassBuilder::buildDefaultAAPipeline() { if (EnableGlobalAnalyses) AA.registerModuleAnalysis(); + if (EnableDRFAA) + // Add support for using Tapir parallel control flow to inform alias + // analysis based on the data-race-free assumption. + AA.registerFunctionAnalysis(); + // Add target-specific alias analyses. if (TM) TM->registerDefaultAliasAnalyses(AA); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 3b92823cd283b47..820f88b87531e89 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -48,10 +48,13 @@ MODULE_PASS("attributor-light", AttributorLightPass()) MODULE_PASS("called-value-propagation", CalledValuePropagationPass()) MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass()) MODULE_PASS("check-debugify", NewPMCheckDebugifyPass()) +MODULE_PASS("cilksan", CilkSanitizerPass()) MODULE_PASS("constmerge", ConstantMergePass()) MODULE_PASS("coro-cleanup", CoroCleanupPass()) MODULE_PASS("coro-early", CoroEarlyPass()) MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass()) +MODULE_PASS("csi", ComprehensiveStaticInstrumentationPass()) +MODULE_PASS("csi-setup", CSISetupPass()) MODULE_PASS("deadargelim", DeadArgumentEliminationPass()) MODULE_PASS("debugify", NewPMDebugifyPass()) MODULE_PASS("dfsan", DataFlowSanitizerPass()) @@ -83,6 +86,8 @@ MODULE_PASS("invalidate", InvalidateAllAnalysesPass()) MODULE_PASS("iroutliner", IROutlinerPass()) MODULE_PASS("jmc-instrumenter", JMCInstrumenterPass()) MODULE_PASS("lower-emutls", LowerEmuTLSPass()) +MODULE_PASS("print-ir-similarity", IRSimilarityAnalysisPrinterPass(dbgs())) +MODULE_PASS("loop-spawning", LoopSpawningPass()) MODULE_PASS("lower-global-dtors", LowerGlobalDtorsPass()) MODULE_PASS("lower-ifunc", LowerIFuncPass()) MODULE_PASS("lowertypetests", LowerTypeTestsPass()) @@ -138,6 +143,7 @@ MODULE_PASS("strip-debug-declare", StripDebugDeclarePass()) MODULE_PASS("strip-nondebug", StripNonDebugSymbolsPass()) MODULE_PASS("strip-nonlinetable-debuginfo", StripNonLineTableDebugInfoPass()) MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation()) +MODULE_PASS("tapir2target", TapirToTargetPass()) MODULE_PASS("trigger-crash-module", TriggerCrashModulePass()) MODULE_PASS("trigger-verifier-error", TriggerVerifierErrorPass()) MODULE_PASS("tsan-module", ModuleThreadSanitizerPass()) @@ -286,6 +292,7 @@ FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) FUNCTION_ANALYSIS("phi-values", PhiValuesAnalysis()) FUNCTION_ANALYSIS("postdomtree", PostDominatorTreeAnalysis()) FUNCTION_ANALYSIS("regions", RegionInfoAnalysis()) +FUNCTION_ANALYSIS("race-detect", TapirRaceDetect()) 
FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis()) FUNCTION_ANALYSIS("should-not-run-function-passes", ShouldNotRunFunctionPassesAnalysis()) @@ -296,6 +303,7 @@ FUNCTION_ANALYSIS("stack-safety-local", StackSafetyAnalysis()) FUNCTION_ANALYSIS("target-ir", TM ? TM->getTargetIRAnalysis() : TargetIRAnalysis()) FUNCTION_ANALYSIS("target-lib-info", TargetLibraryAnalysis()) +FUNCTION_ANALYSIS("tasks", TaskAnalysis()) FUNCTION_ANALYSIS("uniformity", UniformityInfoAnalysis()) FUNCTION_ANALYSIS("verify", VerifierAnalysis()) @@ -308,6 +316,7 @@ FUNCTION_ALIAS_ANALYSIS("objc-arc-aa", objcarc::ObjCARCAA()) FUNCTION_ALIAS_ANALYSIS("scev-aa", SCEVAA()) FUNCTION_ALIAS_ANALYSIS("scoped-noalias-aa", ScopedNoAliasAA()) FUNCTION_ALIAS_ANALYSIS("tbaa", TypeBasedAA()) +FUNCTION_ALIAS_ANALYSIS("drf-aa", DRFAA()) #undef FUNCTION_ALIAS_ANALYSIS #undef FUNCTION_ANALYSIS @@ -350,6 +359,11 @@ FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM)) FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM)) FUNCTION_PASS("expand-large-fp-convert", ExpandLargeFpConvertPass(TM)) FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM)) +FUNCTION_PASS("drf-scoped-noalias", DRFScopedNoAliasPass()) +FUNCTION_PASS("view-dom", DomViewer()) +FUNCTION_PASS("view-dom-only", DomOnlyViewer()) +FUNCTION_PASS("view-post-dom", PostDomViewer()) +FUNCTION_PASS("view-post-dom-only", PostDomOnlyViewer()) FUNCTION_PASS("fix-irreducible", FixIrreduciblePass()) FUNCTION_PASS("flatten-cfg", FlattenCFGPass()) FUNCTION_PASS("float2int", Float2IntPass()) @@ -401,6 +415,16 @@ FUNCTION_PASS("move-auto-init", MoveAutoInitPass()) FUNCTION_PASS("nary-reassociate", NaryReassociatePass()) FUNCTION_PASS("newgvn", NewGVNPass()) FUNCTION_PASS("no-op-function", NoOpFunctionPass()) +FUNCTION_PASS("jump-threading", JumpThreadingPass()) +FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass()) +FUNCTION_PASS("kcfi", KCFIPass()) +FUNCTION_PASS("lcssa", LCSSAPass()) +FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass()) +FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass()) +FUNCTION_PASS("loop-fusion", LoopFusePass()) +FUNCTION_PASS("loop-distribute", LoopDistributePass()) +FUNCTION_PASS("loop-versioning", LoopVersioningPass()) +FUNCTION_PASS("loop-stripmine", LoopStripMinePass()) FUNCTION_PASS("objc-arc", ObjCARCOptPass()) FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass()) FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass()) @@ -439,6 +463,18 @@ FUNCTION_PASS("print", PostDominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", RegionInfoPrinterPass(dbgs())) FUNCTION_PASS("print", ScalarEvolutionPrinterPass(dbgs())) FUNCTION_PASS("print", StackSafetyPrinterPass(dbgs())) +FUNCTION_PASS("print", TapirRaceDetectPrinterPass(dbgs())) +FUNCTION_PASS("print", RegionInfoPrinterPass(dbgs())) +FUNCTION_PASS("print", ScalarEvolutionPrinterPass(dbgs())) +FUNCTION_PASS("print", StackSafetyPrinterPass(dbgs())) +FUNCTION_PASS("print", LoopAccessInfoPrinterPass(dbgs())) +FUNCTION_PASS("print", TaskPrinterPass(dbgs())) +// TODO: rename to print after NPM switch +FUNCTION_PASS("print-alias-sets", AliasSetsPrinterPass(dbgs())) +FUNCTION_PASS("print-cfg-sccs", CFGSCCPrinterPass(dbgs())) +FUNCTION_PASS("print-predicateinfo", PredicateInfoPrinterPass(dbgs())) +FUNCTION_PASS("print-mustexecute", MustExecutePrinterPass(dbgs())) +FUNCTION_PASS("print-memderefs", MemDerefPrinterPass(dbgs())) FUNCTION_PASS("print", UniformityInfoPrinterPass(dbgs())) FUNCTION_PASS("reassociate", ReassociatePass()) 
FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass()) @@ -446,6 +482,8 @@ FUNCTION_PASS("reg2mem", RegToMemPass()) FUNCTION_PASS("safe-stack", SafeStackPass(TM)) FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass()) FUNCTION_PASS("scalarizer", ScalarizerPass()) +FUNCTION_PASS("separate-const-offset-from-gep", SeparateConstOffsetFromGEPPass()) +FUNCTION_PASS("serialize-small-tasks", SerializeSmallTasksPass()) FUNCTION_PASS("sccp", SCCPPass()) FUNCTION_PASS("select-optimize", SelectOptimizePass(TM)) FUNCTION_PASS("separate-const-offset-from-gep", @@ -463,6 +501,8 @@ FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) FUNCTION_PASS("trigger-crash-function", TriggerCrashFunctionPass()) FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass()) FUNCTION_PASS("tsan", ThreadSanitizerPass()) +FUNCTION_PASS("task-canonicalize", TaskCanonicalizePass()) +FUNCTION_PASS("task-simplify", TaskSimplifyPass()) FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 62078822c89b184..f80e3d149f06563 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1049,6 +1049,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::bf16, Custom); } + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { @@ -1188,7 +1191,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, ISD::FSIN, ISD::FCOS, ISD::FTAN, ISD::FASIN, ISD::FACOS, ISD::FATAN, ISD::FSINH, ISD::FCOSH, ISD::FTANH, - ISD::FPOW, ISD::FLOG, ISD::FLOG2, + ISD::FPOW, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, @@ -1196,7 +1199,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, ISD::STRICT_FSQRT, ISD::STRICT_FRINT, - ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, + ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM}) setOperationAction(Op, MVT::v1f64, Expand); @@ -2854,6 +2857,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::CTTZ_ELTS) MAKE_CASE(AArch64ISD::CALL_ARM64EC_TO_X64) MAKE_CASE(AArch64ISD::URSHR_I_PRED) + MAKE_CASE(AArch64ISD::EH_SJLJ_SETJMP) + MAKE_CASE(AArch64ISD::EH_SJLJ_LONGJMP) } #undef MAKE_CASE return nullptr; @@ -3210,6 +3215,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( return EmitZero(MI, BB); case AArch64::ZERO_T_PSEUDO: return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true); + case AArch64::AArch64_setjmp_instr: + return EmitSetjmp(MI, BB); + case AArch64::AArch64_longjmp_instr: + return EmitLongjmp(MI, BB); } } @@ -7085,6 +7094,10 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFLDEXP(Op, DAG); case 
ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return LowerVECTOR_HISTOGRAM(Op, DAG); + case ISD::EH_SJLJ_SETJMP: + return LowerSetjmp(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: + return LowerLongjmp(Op, DAG); } } @@ -28761,3 +28774,187 @@ void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const { } } #endif + +SDValue AArch64TargetLowering::LowerSetjmp(SDValue Op, + SelectionDAG &DAG) const { + return DAG.getNode(AArch64ISD::EH_SJLJ_SETJMP, SDLoc(Op), + DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), + Op.getOperand(1)); +} + +SDValue AArch64TargetLowering::LowerLongjmp(SDValue Op, + SelectionDAG &DAG) const { + return DAG.getNode(AArch64ISD::EH_SJLJ_LONGJMP, SDLoc(Op), MVT::Other, + Op.getOperand(0), Op.getOperand(1)); +} + +MachineBasicBlock * +AArch64TargetLowering::EmitSetjmp(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const AArch64RegisterInfo *TRI = + &Subtarget->getInstrInfo()->getRegisterInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + DebugLoc DL = MI.getDebugLoc(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = ++MBB->getIterator(); + + // Memory Reference + SmallVector MMOs(MI.memoperands_begin(), + MI.memoperands_end()); + + Register DstReg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); + Register mainDstReg = MRI.createVirtualRegister(RC); + Register restoreDstReg = MRI.createVirtualRegister(RC); + Register AddrReg = MI.getOperand(1).getReg(); + + MVT PVT = getPointerTy(MF->getDataLayout()); + assert(PVT == MVT::i64 && "Invalid Pointer Size!"); + + // For v = setjmp(buf), we generate + // + // thisMBB: + // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB + // SjLjSetup restoreMBB + // + // mainMBB: + // v_main = 0 + // + // sinkMBB: + // v = phi(main, restore) + // + // restoreMBB: + // v_restore = 1 + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + MF->push_back(restoreMBB); + restoreMBB->setIsEHPad(true); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + unsigned LabelReg = 0; + + // TODO: The four stores generated by setjmp should be merged into two + // pairs. They are generated out of order by two separate blocks of + // code (0+2 by machine independent code and 1+3 here). + + // Calculate resume address. ADR has +/- 1 MB range. + LabelReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(AArch64::ADR), LabelReg) + .addMBB(restoreMBB); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(AArch64::STRXui)); + MIB.addReg(LabelReg); + MIB.addReg(AddrReg); + MIB.addImm(1); // scaled by word size + MIB.setMemRefs(MMOs); + + MIB = BuildMI(*thisMBB, MI, DL, TII->get(AArch64::STRXui)); + MIB.addReg(TRI->hasBasePointer(*MF) ? 
TRI->getBaseRegister() : AArch64::XZR); + MIB.addReg(AddrReg); + MIB.addImm(3); // scaled by word size + + // x86 has cf-protection-return check here + + // Add a special terminator instruction to make the resume block reachable. + MIB = BuildMI(*thisMBB, MI, DL, TII->get(AArch64::EH_SjLj_Setup)) + .addMBB(restoreMBB); + // TODO: This unnecessarily flushes registers on the fallthrough + // path even though only restoreMBB loses register state. The data + // loss needs to be added to the edge. Putting the register mask in + // the destination block is too late because the compiler will put + // spills of already-invalid registers before the invalidation note. + MIB.addRegMask(MRI.getTargetRegisterInfo()->getNoPreservedMask()); + // For now these successors should not have branch probabilities. + // Although mainMBB is much more likely, adding probabilities causes + // poor code generation later, in part by suppressing tail duplication. + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(restoreMBB); + + // mainMBB: dst = 0 << 0 + BuildMI(mainMBB, DL, TII->get(AArch64::MOVZWi), mainDstReg) + .addImm(0) + .addImm(0); + BuildMI(mainMBB, DL, TII->get(AArch64::B)).addMBB(sinkMBB); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(AArch64::PHI), DstReg) + .addReg(mainDstReg) + .addMBB(mainMBB) + .addReg(restoreDstReg) + .addMBB(restoreMBB); + + // restoreMBB: dst = 1 << 0 + BuildMI(restoreMBB, DL, TII->get(AArch64::MOVZWi), restoreDstReg) + .addImm(1) + .addImm(0); + BuildMI(restoreMBB, DL, TII->get(AArch64::B)).addMBB(sinkMBB); + restoreMBB->addSuccessor(sinkMBB); + + MI.eraseFromParent(); + return sinkMBB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitLongjmp(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const AArch64RegisterInfo *TRI = static_cast( + MF->getSubtarget().getRegisterInfo()); + DebugLoc DL = MI.getDebugLoc(); + Register PC = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register AddrReg = MI.getOperand(0).getReg(); + Register StackReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + MachineInstrBuilder MIB; + + // The frame pointer is overwritten by the first load so + // copy it to a temporary register if necessary. + if (AddrReg == AArch64::FP || AddrReg == AArch64::SP) { + Register AddrTmp = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + MIB = BuildMI(*MBB, MI, DL, TII->get(AArch64::ORRXri), AddrTmp); + MIB.addReg(AddrReg); + MIB.addImm(0); + AddrReg = AddrTmp; + MI.getOperand(0).ChangeToRegister(AddrTmp, false, false, true); + } + + // FP, PC + MIB = BuildMI(*MBB, MI, DL, TII->get(AArch64::LDPXi)); + MIB.addReg(AArch64::FP, RegState::Define); + MIB.addReg(PC, RegState::Define); + MIB.addReg(AddrReg); + MIB.addImm(0); // scaled by word size + // SP (indirectly) and X19 + // X19 may be used as the base pointer for an over-aligned stack frame. + // If not, the setjmp restore block does not expect values in X19 to be live. 
+ MIB = BuildMI(*MBB, MI, DL, TII->get(AArch64::LDPXi)); + MIB.addReg(StackReg, RegState::Define); + MIB.addReg(TRI->getBaseRegister(), RegState::Define); + MIB.addReg(AddrReg); + MIB.addImm(2); // scaled by word size + MIB = BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDXri), AArch64::SP); + MIB.addReg(StackReg); + MIB.addImm(0); // immediate + MIB.addImm(0); // shift count + MIB = BuildMI(*MBB, MI, DL, TII->get(AArch64::BR)); + MIB.addReg(PC); + MI.eraseFromParent(); + return MBB; +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 81e15185f985d50..f5d0898c68b9d88 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -470,6 +470,10 @@ enum NodeType : unsigned { // chain = MSRR(chain, sysregname, lo64, hi64) MSRR, + // Builtin setjmp and longjmp + EH_SJLJ_SETJMP, + EH_SJLJ_LONGJMP, + // Strict (exception-raising) floating point comparison STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPE, @@ -1213,6 +1217,8 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const; + SDValue LowerSetjmp(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLongjmp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op, SelectionDAG &DAG) const; @@ -1356,6 +1362,10 @@ class AArch64TargetLowering : public TargetLowering { unsigned getMinimumJumpTableEntries() const override; bool softPromoteHalfType() const override { return true; } + + MachineBasicBlock *EmitSetjmp(MachineInstr &MI, MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitLongjmp(MachineInstr &MI, + MachineBasicBlock *MBB) const; }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 805684ef69a5928..0c6165c9483145e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -10030,6 +10030,84 @@ AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { Init, IsUpdatePriorComp, Cond); } +std::optional +AArch64InstrInfo::isZeroTest(MachineBasicBlock &MBB) const { + const AArch64RegisterInfo *TRI = &getRegisterInfo(); + MachineBasicBlock *U = nullptr, *Zero = nullptr, *Nonzero = nullptr; + + MachineBasicBlock::const_reverse_instr_iterator MI = MBB.instr_rbegin(); + while (MI != MBB.instr_rend() && MI->isUnconditionalBranch()) { + U = getBranchDestBlock(*MI); + ++MI; + } + + if (MI == MBB.instr_rend()) + return std::optional(); + + switch (MI->getOpcode()) { + case AArch64::CBNZW: + case AArch64::CBNZX: + Zero = U; + Nonzero = MI->getOperand(1).getMBB(); + break; + case AArch64::CBZW: + case AArch64::CBZX: + Nonzero = U; + Zero = MI->getOperand(1).getMBB(); + break; + default: + return std::optional(); + } + + BlockBRNZ Desc; + Desc.IsKill = MI->getOperand(0).isKill(); + Desc.Regs.push_back(MI->getOperand(0).getReg()); + Desc.Zero = Zero; + Desc.Nonzero = Nonzero; + + const Register &Reg0 = Desc.Regs[0]; + + while (++MI != MBB.instr_rend()) { + if (MI->isPHI()) { + if (MI->getOperand(0).getReg() == Reg0) { + unsigned NumOperands = MI->getNumOperands(); + for (unsigned I = 1; I < NumOperands; I += 2) { + Desc.Regs.push_back(MI->getOperand(I).getReg()); + } + } + // There should be only one PHI setting the register. 
+ return Desc; + } + if (MI->modifiesRegister(Reg0, TRI)) + return std::optional(); + if (MI->readsRegister(Reg0, TRI)) + Desc.IsKill = false; + } + return Desc; +} + +bool +AArch64InstrInfo::isSetConstant(const MachineInstr &MI, Register &Reg, + int64_t &Value) const { + if (MI.getNumOperands() < 3 || !MI.getOperand(0).isReg()) + return false; + // describeLoadedValue, but ParamLoadedValue is complicated... + switch (MI.getOpcode()) { + case AArch64::MOVZWi: + case AArch64::MOVZXi: { + if (!MI.getOperand(1).isImm()) + return false; + Reg = MI.getOperand(0).getReg(); + int64_t Immediate = MI.getOperand(1).getImm(); + int Shift = MI.getOperand(2).getImm(); + Value = Immediate << Shift; + // range check is easier than worrying about extension and truncation + return (Value & 0x7fffffff) == Value; + } + } + return false; +} + #define GET_INSTRINFO_HELPERS #define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 69ee0a70765e1cf..676b80fc5d43c0d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -427,6 +427,11 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { bool optimizeCondBranch(MachineInstr &MI) const override; CombinerObjective getCombinerObjective(unsigned Pattern) const override; + + std::optional isZeroTest(MachineBasicBlock &MBB) const override; + bool isSetConstant(const MachineInstr &MI, Register &Reg, + int64_t &Value) const override; + /// Return true when a code sequence can improve throughput. It /// should be called only for instructions in loops. /// \param Pattern - combiner pattern diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 1053ba9242768aa..d8353b95f3a882d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1062,7 +1062,7 @@ def PROBED_STACKALLOC_DYN : Pseudo<(outs), [(AArch64probedalloca GPR64common:$target)]>, Sched<[]>; -} // Defs = [SP, NZCV], Uses = [SP] in +} // Defs = [SP, NZCV], Uses = [SP] in } // hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { @@ -2017,8 +2017,10 @@ def : Pat<(AArch64mrs imm:$id), // The thread pointer (on Linux, at least, where this has been implemented) is // TPIDR_EL0. 
+let mayLoad = 1 in { def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>; +} // This gets lowered into a 24-byte instruction sequence let Defs = [ X9, X16, X17, NZCV ], Size = 24 in { @@ -7325,19 +7327,19 @@ def : Pat<(i32 (and (i32 (vector_extract (opNode (v8i16 V128:$Rn)), (i64 0))), } // For vecreduce_add, used by GlobalISel not SDAG -def : Pat<(i8 (vecreduce_add (v8i8 V64:$Rn))), +def : Pat<(i8 (vecreduce_add (v8i8 V64:$Rn))), (i8 (ADDVv8i8v V64:$Rn))>; -def : Pat<(i8 (vecreduce_add (v16i8 V128:$Rn))), +def : Pat<(i8 (vecreduce_add (v16i8 V128:$Rn))), (i8 (ADDVv16i8v V128:$Rn))>; -def : Pat<(i16 (vecreduce_add (v4i16 V64:$Rn))), +def : Pat<(i16 (vecreduce_add (v4i16 V64:$Rn))), (i16 (ADDVv4i16v V64:$Rn))>; -def : Pat<(i16 (vecreduce_add (v8i16 V128:$Rn))), +def : Pat<(i16 (vecreduce_add (v8i16 V128:$Rn))), (i16 (ADDVv8i16v V128:$Rn))>; -def : Pat<(i32 (vecreduce_add (v2i32 V64:$Rn))), +def : Pat<(i32 (vecreduce_add (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub))>; -def : Pat<(i32 (vecreduce_add (v4i32 V128:$Rn))), +def : Pat<(i32 (vecreduce_add (v4i32 V128:$Rn))), (i32 (ADDVv4i32v V128:$Rn))>; -def : Pat<(i64 (vecreduce_add (v2i64 V128:$Rn))), +def : Pat<(i64 (vecreduce_add (v2i64 V128:$Rn))), (i64 (ADDPv2i64p V128:$Rn))>; defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", AArch64saddv>; @@ -7382,25 +7384,25 @@ def : Pat<(i16 (opNode (v4i16 FPR64:$Rn))), def : Pat<(i16 (opNode (v8i16 FPR128:$Rn))), (!cast(!strconcat(baseOpc, "v8i16v")) FPR128:$Rn)>; -def : Pat<(i32 (opNode (v4i32 V128:$Rn))), +def : Pat<(i32 (opNode (v4i32 V128:$Rn))), (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn)>; } // For v2i32 source type, the pairwise instruction can be used instead defm : SIMDAcrossLanesVecReductionIntrinsic<"UMINV", vecreduce_umin>; -def : Pat<(i32 (vecreduce_umin (v2i32 V64:$Rn))), +def : Pat<(i32 (vecreduce_umin (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub))>; defm : SIMDAcrossLanesVecReductionIntrinsic<"UMAXV", vecreduce_umax>; -def : Pat<(i32 (vecreduce_umax (v2i32 V64:$Rn))), +def : Pat<(i32 (vecreduce_umax (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>; defm : SIMDAcrossLanesVecReductionIntrinsic<"SMINV", vecreduce_smin>; -def : Pat<(i32 (vecreduce_smin (v2i32 V64:$Rn))), +def : Pat<(i32 (vecreduce_smin (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub))>; defm : SIMDAcrossLanesVecReductionIntrinsic<"SMAXV", vecreduce_smax>; -def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))), +def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>; multiclass SIMDAcrossLanesSignedLongIntrinsic { @@ -10262,6 +10264,30 @@ defm : PromoteBinaryv8f16Tov4f32; defm : PromoteBinaryv8f16Tov4f32; defm : PromoteBinaryv8f16Tov4f32; +def AArch64eh_sjlj_setjmp : SDNode<"AArch64ISD::EH_SJLJ_SETJMP", + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +def AArch64eh_sjlj_longjmp : SDNode<"AArch64ISD::EH_SJLJ_LONGJMP", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPSideEffect]>; + +let isCodeGenOnly = 1, usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { + def AArch64_setjmp_instr : Pseudo<(outs GPR32:$dst), (ins GPR64:$buf), + [(set GPR32:$dst, (AArch64eh_sjlj_setjmp GPR64:$buf))]>; + def AArch64_longjmp_instr : Pseudo<(outs), (ins GPR64:$buf), + [(AArch64eh_sjlj_longjmp GPR64:$buf)]>; +} + +// This instruction is needed to make the longjmp 
target block reachable. +def EH_SjLj_Setup : AArch64Inst { + let isTerminator = 1; + let isCodeGenOnly = 1; + let hasNoSchedulingInfo = 1; + let AsmString = "#EH_SjLj_Setup\t$dst"; + dag OutOperandList = (outs); + dag InOperandList = (ins am_brcond:$dst); +} + include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" include "AArch64SMEInstrInfo.td" diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 435cc18cdea6249..4110f1ae93e49e8 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -573,6 +573,12 @@ unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); + // For stealable functions, where the stack pointer can change dramatically + // during execution, the base pointer is the only reliable way to reference + // local variables. + if (MF.getFunction().hasFnAttribute(Attribute::Stealable)) + return true; + // In the presence of variable sized objects or funclets, if the fixed stack // size is large enough that referencing from the FP won't result in things // being in range relatively often, we can use a base pointer to allow access diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index c3e12b6d8024e98..3eede0009ab2a9e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -708,7 +708,8 @@ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, return; } - if (MI.getOpcode() == AArch64::SPACE) { + if (MI.getOpcode() == AArch64::SPACE || + MI.getOpcode() == AArch64::EH_SjLj_Setup) { // SPACE just increases basic block size, in both cases no actual code. return; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 45989bcd07d37e1..32bbdb537447b44 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36130,6 +36130,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); // Memory Reference SmallVector MMOs(MI.memoperands_begin(), @@ -36162,6 +36163,18 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, thisMBB = emitLongJmpShadowStackFix(MI, thisMBB); } + // Copy stack addresses to a temporary register. + if (MI.getOperand(0).isFI() || MI.readsRegister(FP, TRI) || + MI.readsRegister(SP, TRI)) { + Register AddrTmp = MRI.createVirtualRegister(RC); + unsigned LEA = (PVT == MVT::i64) ? 
X86::LEA64r : X86::LEA32r; + MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(LEA), AddrTmp); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + MIB.add(MI.getOperand(i)); + } + MI.getOperand(0).ChangeToRegister(AddrTmp, false, false, true); + } + // Reload FP MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index fab7c167e385f9e..7682dec28eead81 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4015,22 +4015,68 @@ bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const { + return removeBranchImpl(MBB, BytesRemoved, false); +} + +unsigned X86InstrInfo::removeBranchAndFlags(MachineBasicBlock &MBB, + int *BytesRemoved) const { + return removeBranchImpl(MBB, BytesRemoved, true); +} + +unsigned X86InstrInfo::removeBranchImpl(MachineBasicBlock &MBB, + int *BytesRemoved, + bool DeleteFlags) const { assert(!BytesRemoved && "code size not handled"); + const X86RegisterInfo *TRI = &getRegisterInfo(); MachineBasicBlock::iterator I = MBB.end(); unsigned Count = 0; + bool FlagsDead = false; while (I != MBB.begin()) { --I; if (I->isDebugInstr()) continue; - if (I->getOpcode() != X86::JMP_1 && - X86::getCondFromBranch(*I) == X86::COND_INVALID) - break; - // Remove the branch. - I->eraseFromParent(); - I = MBB.end(); - ++Count; + if (I->getOpcode() == X86::JMP_1) { + // Remove the branch. + I->eraseFromParent(); + I = MBB.end(); + ++Count; + continue; + } + if (X86::getCondFromBranch(*I) != X86::COND_INVALID) { + if (DeleteFlags && I->killsRegister(X86::EFLAGS, TRI)) { + FlagsDead = true; + } + // Remove the branch. + I->eraseFromParent(); + I = MBB.end(); + ++Count; + continue; + } + if (!FlagsDead) + continue; + if (I->hasUnmodeledSideEffects() || I->readsRegister(X86::EFLAGS, TRI)) { + FlagsDead = false; + continue; + } + if (I->modifiesRegister(X86::EFLAGS, TRI)) { + /* This is like allDefsAreDead but ignores EFLAGS. */ + for (const MachineOperand &MO : I->operands()) { + if (MO.isReg() && MO.getReg().id() != X86::EFLAGS && !MO.isUse() && + !MO.isDead()) { + FlagsDead = false; + break; + } + } + if (FlagsDead) { + FlagsDead = false; + I->eraseFromParent(); + I = MBB.end(); + ++Count; + continue; + } + } } return Count; @@ -5576,6 +5622,115 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return true; } +std::optional +X86InstrInfo::isZeroTest(MachineBasicBlock &MBB) const { + const X86RegisterInfo *TRI = &getRegisterInfo(); + SmallVector Cond; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + MachineBasicBlock *Zero = nullptr, *Nonzero = nullptr; + + if (analyzeBranch(MBB, TBB, FBB, Cond, false) || Cond.size() != 1) + return std::optional(); + + switch (Cond[0].getImm()) { + case X86::COND_E: + Nonzero = FBB; + Zero = TBB; + break; + case X86::COND_NE: + Nonzero = TBB; + Zero = FBB; + break; + default: + return std::optional(); + } + MachineBasicBlock::const_reverse_instr_iterator MI = MBB.instr_rbegin(); + while (MI != MBB.instr_rend() && MI->isUnconditionalBranch()) + ++MI; + + if (MI == MBB.instr_rend() || !MI->isConditionalBranch()) + return std::optional(); + + BlockBRNZ Desc; + Desc.Zero = Zero; + Desc.Nonzero = Nonzero; + + // Only handle conditional branches that kill EFLAGS, because + // that is the common case. 
+ // if (!MI->killsRegister(X86::EFLAGS)) + // return false; + + while (++MI != MBB.instr_rend()) { + // TEST32rr is the usual instruction to compare against zero. + if (MI->getOpcode() == X86::TEST32rr) { + const MachineOperand &op = MI->getOperand(0); + if (op.getReg() != MI->getOperand(1).getReg()) + return std::optional(); + Desc.IsKill = op.isKill(); + Desc.Regs.push_back(op.getReg()); + break; + } + // If EFLAGS is set other than by TEST32rr, fail. + // TODO: Possibly also CMP32ri8? + if (MI->modifiesRegister(X86::EFLAGS, TRI)) + return std::optional(); + } + if (Desc.Regs.size() != 1) { + return std::optional(); + } + const Register &Reg0 = Desc.Regs[0]; + + while (++MI != MBB.instr_rend()) { + if (MI->isPHI()) { + if (MI->getOperand(0).getReg() == Reg0) { + unsigned NumOperands = MI->getNumOperands(); + for (unsigned I = 1; I < NumOperands; I += 2) { + Desc.Regs.push_back(MI->getOperand(I).getReg()); + } + } + // There should be only one PHI setting the register. + return Desc; + } + if (MI->modifiesRegister(Reg0, TRI)) + return std::optional(); + if (MI->readsRegister(Reg0, TRI)) + Desc.IsKill = false; + } + return Desc; +} + +bool X86InstrInfo::isSetConstant(const MachineInstr &MI, Register &Reg, + int64_t &Value) const { + if (MI.getNumOperands() < 1) + return false; + const MachineOperand &Op0 = MI.getOperand(0); + if (!Op0.isReg()) + return false; + Reg = Op0.getReg(); + switch (MI.getOpcode()) { + case X86::MOV32r0: + Value = 0; + return true; + case X86::MOV32r1: + Value = 1; + return true; + case X86::XOR32rr: + if (MI.getOperand(1).getReg() != Reg) + return false; + Value = 0; + return true; + case X86::MOV32ri: { + const MachineOperand &Src = MI.getOperand(1); + if (!Src.isImm()) + return false; + Value = Src.getImm(); + return true; + } + default: + return false; + } +} + /// Try to remove the load by folding it to a register /// operand at the use. We fold the load instructions if load defines a virtual /// register, the virtual register is used once in the same BB, and the diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index eaa3dd089394893..476bb01a839577f 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -405,6 +405,8 @@ class X86InstrInfo final : public X86GenInstrInfo { unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved = nullptr) const override; + unsigned removeBranchAndFlags(MachineBasicBlock &MBB, + int *BytesRemoved = nullptr) const override; unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef Cond, const DebugLoc &DL, @@ -575,6 +577,10 @@ class X86InstrInfo final : public X86GenInstrInfo { Register &FoldAsLoadDefReg, MachineInstr *&DefMI) const override; + std::optional isZeroTest(MachineBasicBlock &MBB) const override; + bool isSetConstant(const MachineInstr &MI, Register &Reg, + int64_t &Value) const override; + bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const override; @@ -651,6 +657,9 @@ class X86InstrInfo final : public X86GenInstrInfo { int FI) const override; private: + unsigned removeBranchImpl(MachineBasicBlock &MBB, int *BytesRemoved, + bool DeleteFlags) const; + /// This is a helper for convertToThreeAddress for 8 and 16-bit instructions. /// We use 32-bit LEA to form 3-address code by promoting to a 32-bit /// super-register and then truncating back down to a 8/16-bit sub-register. 
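Aside (not part of the patch): the `EH_SJLJ_SETJMP`/`EH_SJLJ_LONGJMP` lowering above follows the usual builtin-setjmp contract — `mainMBB` materializes 0 for the direct return and `restoreMBB` materializes 1 when control re-enters through a longjmp. A minimal sketch of the source-level pattern this supports, assuming standard Clang/GCC `__builtin_setjmp`/`__builtin_longjmp` semantics (five-pointer buffer, longjmp value fixed at 1); the names below are illustrative only:

```cpp
// Minimal sketch (not from the patch) of the control flow that the
// EH_SJLJ_SETJMP / EH_SJLJ_LONGJMP lowering implements.
#include <cstdio>

static void *jmp_buffer[5];  // __builtin_setjmp expects a buffer of 5 pointers

static void unwind_to_checkpoint() {
  // The second argument to __builtin_longjmp must be the constant 1,
  // which is why restoreMBB materializes exactly 1 into the result.
  __builtin_longjmp(jmp_buffer, 1);
}

int main() {
  if (__builtin_setjmp(jmp_buffer) == 0) {
    std::puts("direct path (mainMBB: v = 0)");
    unwind_to_checkpoint();  // jumps back to the setjmp site
  } else {
    std::puts("resumed via longjmp (restoreMBB: v = 1)");
  }
  return 0;
}
```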
diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt index 7046f2f4b1d2c19..a39d11f2d2c7684 100644 --- a/llvm/lib/Transforms/CMakeLists.txt +++ b/llvm/lib/Transforms/CMakeLists.txt @@ -9,3 +9,4 @@ add_subdirectory(ObjCARC) add_subdirectory(Coroutines) add_subdirectory(CFGuard) add_subdirectory(HipStdPar) +add_subdirectory(Tapir) diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt index 92a9697720efd4e..63660eee5d0c241 100644 --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -70,6 +70,7 @@ add_llvm_component_library(LLVMipo ProfileData Scalar Support + TapirOpts TargetParser TransformUtils Vectorize diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 7b419d0f098b5c2..3537077efdd7d5d 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -43,6 +43,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/PassManager.h" @@ -221,6 +222,11 @@ checkFunctionMemoryAccess(Function &F, bool ThisBody, AAResults &AAR, if (ArgMR != ModRefInfo::NoModRef) addArgLocs(ME, Call, ArgMR, AAR); continue; + } else if (isa(I) || isa(I) || isa(I)) { + // Tapir instructions only access memory accessed by other instructions in + // the function. Hence we let the other instructions determine the + // attribute of this function. + continue; } ModRefInfo MR = ModRefInfo::NoModRef; @@ -1491,6 +1497,13 @@ static bool InstrBreaksNonThrowing(Instruction &I, const SCCNodeSet &SCCNodes) { return false; if (const auto *CI = dyn_cast(&I)) { if (Function *Callee = CI->getCalledFunction()) { + // Ignore sync.unwind, detached.rethrow, and taskframe.resume when + // checking if a function can throw, since they are simply placeholders. + if (Intrinsic::sync_unwind == Callee->getIntrinsicID() || + Intrinsic::detached_rethrow == Callee->getIntrinsicID() || + Intrinsic::taskframe_resume == Callee->getIntrinsicID()) + return false; + // I is a may-throw call to a function inside our SCC. This doesn't // invalidate our current working assumption that the SCC is no-throw; we // just have to scan that other function. @@ -1691,17 +1704,60 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, // If all of the calls in F are identifiable and are to norecurse functions, F // is norecurse. This check also detects self-recursion as F is not currently // marked norecurse, so any called from F to F will not be marked norecurse. - for (auto &BB : *F) - for (auto &I : BB.instructionsWithoutDebug()) + for (auto &BB : *F) { + for (auto &I : BB.instructionsWithoutDebug()) { if (auto *CB = dyn_cast(&I)) { Function *Callee = CB->getCalledFunction(); if (!Callee || Callee == F || (!Callee->doesNotRecurse() && !(Callee->isDeclaration() && - Callee->hasFnAttribute(Attribute::NoCallback)))) - // Function calls a potentially recursive function. - return; + Callee->hasFnAttribute(Attribute::NoCallback)))) { + if (Callee && Callee != F) { + // Ignore certain intrinsics when inferring norecurse. 
+ switch (Callee->getIntrinsicID()) { + case Intrinsic::annotation: + case Intrinsic::assume: + case Intrinsic::sideeffect: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + case Intrinsic::is_constant: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + case Intrinsic::experimental_gc_result: + case Intrinsic::experimental_gc_relocate: + case Intrinsic::coro_alloc: + case Intrinsic::coro_begin: + case Intrinsic::coro_free: + case Intrinsic::coro_end: + case Intrinsic::coro_frame: + case Intrinsic::coro_size: + case Intrinsic::coro_suspend: + case Intrinsic::coro_subfn_addr: + case Intrinsic::syncregion_start: + case Intrinsic::detached_rethrow: + case Intrinsic::taskframe_create: + case Intrinsic::taskframe_use: + case Intrinsic::taskframe_end: + case Intrinsic::taskframe_resume: + case Intrinsic::taskframe_load_guard: + case Intrinsic::sync_unwind: + continue; + default: + return; + } + } else { + // Function calls a potentially recursive function. + return; + } + } } + } + } // Every call was to a non-recursive function other than this function, and // we have no indirect recursion as the SCC size is one. This function cannot diff --git a/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp index 18d5911d10f1219..879219bfcf37206 100644 --- a/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -29,6 +29,7 @@ static bool inferAllPrototypeAttributes( if (F.isDeclaration() && !F.hasOptNone()) { if (!F.hasFnAttribute(Attribute::NoBuiltin)) Changed |= inferNonMandatoryLibFuncAttrs(F, GetTLI(F)); + Changed |= inferTapirTargetLibFuncAttributes(F, GetTLI(F)); Changed |= inferAttributesFromOthers(F); } diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 3ca095e1520f3bf..d1d7c09c99ad441 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -826,6 +826,9 @@ PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB, if (I.isLifetimeStartOrEnd()) continue; + if (I.isTaskFrameMarker()) + continue; + if (auto *II = dyn_cast(&I)) { Intrinsic::ID IID = II->getIntrinsicID(); SmallVector Tys; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 9d2990c98ce2753..ab656820498f7e9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -68,6 +68,7 @@ #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include #include #include @@ -2983,6 +2984,127 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { })) return nullptr; break; + case Intrinsic::sync_unwind: { + // If the function does not throw, we don't need the sync.unwind. + if (II->getFunction()->doesNotThrow()) + return eraseInstFromFunction(CI); + + if (II != II->getParent()->getFirstNonPHIOrDbgOrLifetime()) { + // Check if the instruction at the start of II's block is a redundant + // sync.unwind. 
+ const Value *SyncReg = CI.getArgOperand(0); + if (isSyncUnwind(II->getParent()->getFirstNonPHIOrDbgOrLifetime(), + SyncReg)) + return eraseInstFromFunction(CI); + } + // Check for any syncs that might use this sync.unwind. + int NumUsers = 0; + for (BasicBlock *Pred : predecessors(CI.getParent())) + if (isa(Pred->getTerminator())) { + ++NumUsers; + break; + } + // If didn't find any syncs that use this sync.unwind, remove it. + if (!NumUsers) + return eraseInstFromFunction(CI); + break; + } + case Intrinsic::syncregion_start: { + // Check for any users of this syncregion. + int NumUsers = 0; + for (User *U : II->users()) { + // Check for any Tapir instructions using this syncregion. + if (isa(U) || isa(U) || isa(U)) { + ++NumUsers; + break; + } + // Check for any Tapir intrinsics using this syncregion. + if (CallBase *CB = dyn_cast(U)) + if (isSyncUnwind(CB) || isDetachedRethrow(CB)) { + ++NumUsers; + break; + } + } + // If we have no users, it's safe to delete this syncregion. + if (!NumUsers) + return eraseInstFromFunction(CI); + break; + } + case Intrinsic::detached_rethrow: { + assert(isa(II)); + return eraseInstFromFunction(CI); + } + case Intrinsic::taskframe_use: { + // Remove a taskframe.use if it is not in a detached block. + BasicBlock *Parent = II->getParent(); + if (!Parent->getSinglePredecessor()) + return eraseInstFromFunction(CI); + + BasicBlock *Pred = Parent->getSinglePredecessor(); + if (!isa(Pred->getTerminator())) + return eraseInstFromFunction(CI); + + DetachInst *DI = cast(Pred->getTerminator()); + if (DI->getDetached() != Parent) + return eraseInstFromFunction(CI); + break; + } + case Intrinsic::taskframe_create: { + // Remove a taskframe.create if it has no uses. + int NumUsers = 0; + for (User *U : II->users()) { + if (Instruction *I = dyn_cast(U)) + if (isTapirIntrinsic(Intrinsic::taskframe_use, I) || + isTapirIntrinsic(Intrinsic::taskframe_end, I) || + isTaskFrameResume(I)) { + ++NumUsers; + break; + } + } + if (!NumUsers) + return eraseInstFromFunction(CI); + break; + } + case Intrinsic::taskframe_resume: { + assert(isa(II)); + return eraseInstFromFunction(CI); + } + case Intrinsic::tapir_runtime_end: { + Value *PrevRTStart = CI.getArgOperand(0); + // If there's a tapir.runtime.start in the same block after this + // tapir.runtime.end with no interesting instructions in between, eliminate + // both. + BasicBlock::iterator Iter(CI); + while (++Iter != CI.getParent()->end()) { + if (isTapirIntrinsic(Intrinsic::tapir_runtime_start, &*Iter)) { + // Replce the uses of the tapir.runtime.start with the argument to the + // tapir.runtime.end. + replaceInstUsesWith(*Iter, PrevRTStart); + eraseInstFromFunction(*Iter); + return eraseInstFromFunction(CI); + } + if (isa(&*Iter) && !isa(&*Iter)) + // We found a nontrivial call. Give up. + break; + } + break; + } + case Intrinsic::tapir_runtime_start: { + // If there's tapir.runtime.end in the same block after this + // tapir.runtime.start with no interesting instructions in between, + // eliminate both. + BasicBlock::iterator Iter(CI); + while (++Iter != CI.getParent()->end()) { + if (isTapirIntrinsic(Intrinsic::tapir_runtime_end, &*Iter, &CI)) { + eraseInstFromFunction(*Iter); + return eraseInstFromFunction(CI); + } + if (isa(&*Iter) && !isa(&*Iter)) + // We found a nontrivial call. Give up. 
+ break; + } + break; + } case Intrinsic::assume: { Value *IIOperand = II->getArgOperand(0); SmallVector OpBundles; diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 0d8e7e92c5c8e53..4f4f8e7843f0a21 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -4100,6 +4100,7 @@ static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) { case EHPersonality::Wasm_CXX: case EHPersonality::XL_CXX: case EHPersonality::ZOS_CXX: + case EHPersonality::Cilk_CXX: return TypeInfo->isNullValue(); } llvm_unreachable("invalid enum"); @@ -4765,6 +4766,11 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I, // successor block. if (DestBlock->getUniquePredecessor() != I->getParent()) return false; + // We can't generally move an instruction that reads from memory past a + // detach or reattach. + if (isa(I->getParent()->getTerminator()) || + isa(I->getParent()->getTerminator())) + return false; for (BasicBlock::iterator Scan = std::next(I->getIterator()), E = I->getParent()->end(); Scan != E; ++Scan) @@ -5065,6 +5071,10 @@ bool InstCombinerImpl::run() { // Make sure these checks are done only once, naturally we do the checks // the first time we get the userparent, this will save compile time. if (NumUsers == 0) { + // Don't sink if the successor follows through a sync instruction. + if (isa(BB->getTerminator())) + return std::nullopt; + // Try sinking to another block. If that block is unreachable, then do // not bother. SimplifyCFG should handle it. if (UserParent == BB || !DT.isReachableFromEntry(UserParent)) diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 9fb1df7ab2b79c2..eec38d4e19728ae 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/StackSafetyAnalysis.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/BinaryFormat/MachO.h" @@ -713,7 +714,7 @@ class RuntimeCallInserter { /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer { - AddressSanitizer(Module &M, const StackSafetyGlobalInfo *SSGI, + AddressSanitizer(Module &M, const StackSafetyGlobalInfo *SSGI, TaskInfo *TI, int InstrumentationWithCallsThreshold, uint32_t MaxInlinePoisoningSize, bool CompileKernel = false, bool Recover = false, bool UseAfterScope = false, @@ -725,7 +726,7 @@ struct AddressSanitizer { UseAfterScope(UseAfterScope || ClUseAfterScope), UseAfterReturn(ClUseAfterReturn.getNumOccurrences() ? ClUseAfterReturn : UseAfterReturn), - SSGI(SSGI), + SSGI(SSGI), TI(TI), InstrumentationWithCallsThreshold( ClInstrumentationWithCallsThreshold.getNumOccurrences() > 0 ? 
ClInstrumentationWithCallsThreshold @@ -799,6 +800,7 @@ struct AddressSanitizer { bool maybeInsertAsanInitAtFunctionEntry(Function &F); bool maybeInsertDynamicShadowAtFunctionEntry(Function &F); void markEscapedLocalAllocas(Function &F); + void recordInterestingParallelAllocas(const Function &F); private: friend struct FunctionStackPoisoner; @@ -842,6 +844,9 @@ struct AddressSanitizer { FunctionCallee AsanPtrCmpFunction, AsanPtrSubFunction; Constant *AsanShadowGlobal; + // Analyses + TaskInfo *TI; + // These arrays is indexed by AccessIsWrite, Experiment and log2(AccessSize). FunctionCallee AsanErrorCallback[2][2][kNumberOfAccessSizes]; FunctionCallee AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes]; @@ -854,6 +859,7 @@ struct AddressSanitizer { Value *LocalDynamicShadow = nullptr; const StackSafetyGlobalInfo *SSGI; DenseMap ProcessedAllocas; + SmallPtrSet InterestingParallelAllocas; FunctionCallee AMDGPUAddressShared; FunctionCallee AMDGPUAddressPrivate; @@ -1259,8 +1265,11 @@ PreservedAnalyses AddressSanitizerPass::run(Module &M, const StackSafetyGlobalInfo *const SSGI = ClUseStackSafety ? &MAM.getResult(M) : nullptr; for (Function &F : M) { + TaskInfo *TI = nullptr; + if (!F.empty()) + TI = &FAM.getResult(F); AddressSanitizer FunctionSanitizer( - M, SSGI, Options.InstrumentationWithCallsThreshold, + M, SSGI, TI, Options.InstrumentationWithCallsThreshold, Options.MaxInlinePoisoningSize, Options.CompileKernel, Options.Recover, Options.UseAfterScope, Options.UseAfterReturn); const TargetLibraryInfo &TLI = FAM.getResult(F); @@ -1361,6 +1370,8 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { // We are only interested in allocas not promotable to registers. // Promotable allocas are common under -O0. (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) && + (!ClSkipPromotableAllocas || + (TI->isSerial() || InterestingParallelAllocas.contains(&AI))) && // inalloca allocas are not treated as static, and we don't want // dynamic alloca instrumentation for them as well. !AI.isUsedWithInAlloca() && @@ -2932,6 +2943,21 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) { } } +void AddressSanitizer::recordInterestingParallelAllocas(const Function &F) { + if (!ClSkipPromotableAllocas || TI->isSerial()) + return; + + for (const BasicBlock &BB : F) + for (const Instruction &I : BB) + if (const AllocaInst *AI = dyn_cast(&I)) + if (AI->getAllocatedType()->isSized() && + ((!AI->isStaticAlloca()) || getAllocaSizeInBytes(*AI) > 0) && + // We are only interested in allocas not promotable to registers. + // Promotable allocas are common under -O0. + !isAllocaPromotable(AI) && !TI->isAllocaParallelPromotable(AI)) + InterestingParallelAllocas.insert(AI); +} + bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) { bool ShouldInstrument = ClDebugMin < 0 || ClDebugMax < 0 || @@ -2978,6 +3004,10 @@ bool AddressSanitizer::instrumentFunction(Function &F, // can be passed to that intrinsic. markEscapedLocalAllocas(F); + // Record all interesting parallel allocas, using TaskInfo analysis before + // instrumentation may disrupt the validity of the analysis. + recordInterestingParallelAllocas(F); + // We want to instrument every address only once per basic block (unless there // are calls between uses). 
SmallPtrSet TempsToInstrument; diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 4e3f9e27e0c3446..a18a4200d6078d7 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_component_library(LLVMInstrumentation BoundsChecking.cpp CGProfile.cpp ControlHeightReduction.cpp + CilkSanitizer.cpp DataFlowSanitizer.cpp GCOVProfiling.cpp BlockCoverageInference.cpp @@ -25,6 +26,8 @@ add_llvm_component_library(LLVMInstrumentation ValueProfileCollector.cpp ThreadSanitizer.cpp HWAddressSanitizer.cpp + ComprehensiveStaticInstrumentation.cpp + SurgicalInstrumentationConfig.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms @@ -36,6 +39,8 @@ add_llvm_component_library(LLVMInstrumentation Analysis Core Demangle + IRReader + Linker MC Support TargetParser diff --git a/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp new file mode 100644 index 000000000000000..011007d82e3fa0a --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp @@ -0,0 +1,4812 @@ +//===- CilkSanitizer.cpp - Nondeterminism detector for Cilk/Tapir ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of CilkSan, a determinacy-race detector for Cilk +// programs. +// +// This instrumentation pass inserts calls to the runtime library before +// appropriate memory accesses. 
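+// In addition to loads and stores, the pass instruments detaches, detach
+// exits, syncs, allocas, and allocation/free calls (see the STATISTIC
+// counters below), so the Cilksan runtime can maintain its series-parallel
+// and shadow-memory state.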
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/CilkSanitizer.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/MustExecute.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirRaceDetect.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/ModRef.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Instrumentation/CSI.h" +#include "llvm/Transforms/IPO/FunctionAttrs.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "cilksan" + +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size"); +STATISTIC(NumOmittedReadsBeforeWrite, + "Number of reads ignored due to following writes"); +STATISTIC(NumOmittedReadsFromConstants, "Number of reads from constant data"); +STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing"); +STATISTIC(NumInstrumentedMemIntrinsicReads, + "Number of instrumented reads from memory intrinsics"); +STATISTIC(NumInstrumentedMemIntrinsicWrites, + "Number of instrumented writes from memory intrinsics"); +STATISTIC(NumInstrumentedDetaches, "Number of instrumented detaches"); +STATISTIC(NumInstrumentedDetachExits, "Number of instrumented detach exits"); +STATISTIC(NumInstrumentedSyncs, "Number of instrumented syncs"); +STATISTIC(NumInstrumentedAllocas, "Number of instrumented allocas"); +STATISTIC(NumInstrumentedAllocFns, + "Number of instrumented allocation functions"); +STATISTIC(NumInstrumentedFrees, "Number of instrumented free calls"); +STATISTIC( + NumHoistedInstrumentedReads, + "Number of reads whose instrumentation has been coalesced and hoisted"); +STATISTIC( + NumHoistedInstrumentedWrites, + "Number of writes whose instrumentation has been coalesced and hoisted"); +STATISTIC(NumSunkInstrumentedReads, + "Number of reads whose instrumentation has been coalesced and sunk"); +STATISTIC(NumSunkInstrumentedWrites, + "Number of writes whose instrumentation has been coalesced and sunk"); + +static cl::opt + EnableStaticRaceDetection( + "enable-static-race-detection", cl::init(true), cl::Hidden, + 
cl::desc("Enable static detection of determinacy races.")); + +static cl::opt + AssumeRaceFreeLibraryFunctions( + "assume-race-free-lib", cl::init(false), cl::Hidden, + cl::desc("Assume library functions are race free.")); + +static cl::opt + IgnoreInaccessibleMemory( + "ignore-inaccessible-memory", cl::init(false), cl::Hidden, + cl::desc("Ignore inaccessible memory when checking for races.")); + +static cl::opt + AssumeNoExceptions( + "cilksan-assume-no-exceptions", cl::init(false), cl::Hidden, + cl::desc("Assume that ordinary calls cannot throw exceptions.")); + +static cl::opt + MaxUsesToExploreCapture( + "cilksan-max-uses-to-explore-capture", cl::init(unsigned(-1)), + cl::Hidden, + cl::desc("Maximum number of uses to explore for a capture query.")); + +static cl::opt MAAPChecks("cilksan-maap-checks", cl::init(true), + cl::Hidden, + cl::desc("Enable or disable MAAP checks.")); + +static cl::opt LoopHoisting( + "cilksan-loop-hoisting", cl::init(true), cl::Hidden, + cl::desc("Enable or disable hoisting instrumentation out of loops.")); + +static cl::opt + IgnoreSanitizeCilkAttr( + "ignore-sanitize-cilk-attr", cl::init(false), cl::Hidden, + cl::desc("Ignore the 'sanitize_cilk' attribute when choosing what to " + "instrument.")); + +static cl::opt ClCilksanBCPath( + "cilksan-bc-path", cl::init(""), cl::Hidden, + cl::desc("Path to the bitcode file for the Cilksan library.")); + +static const unsigned SERIESPARALLEL = 0x1; +static const unsigned SHADOWMEMORY = 0x2; +static cl::opt InstrumentationSet( + "cilksan-instrumentation-set", cl::init(SERIESPARALLEL | SHADOWMEMORY), + cl::Hidden, + cl::desc("Specify the set of instrumentation hooks to insert.")); + +static const char *const CsanRtUnitInitName = "__csanrt_unit_init"; +static const char *const CsiUnitObjTableName = "__csi_unit_obj_table"; +static const char *const CsiUnitObjTableArrayName = "__csi_unit_obj_tables"; + +/// Maintains a mapping from CSI ID of a load or store to the source information +/// of the object accessed by that load or store. +class ObjectTable : public ForensicTable { +public: + ObjectTable() : ForensicTable() {} + ObjectTable(Module &M, StringRef BaseIdName) : ForensicTable(M, BaseIdName) {} + + /// The number of entries in this table + uint64_t size() const { return LocalIdToSourceLocationMap.size(); } + + /// Add the given instruction to this table. + /// \returns The local ID of the Instruction. + uint64_t add(Instruction &I, Value *Obj); + + /// Get the Type for a pointer to a table entry. + /// + /// A table entry is just a source location. + static PointerType *getPointerType(LLVMContext &C); + + /// Insert this table into the given Module. + /// + /// The table is constructed as a ConstantArray indexed by local IDs. The + /// runtime is responsible for performing the mapping that allows the table to + /// be indexed by global ID. + Constant *insertIntoModule(Module &M) const; + +private: + struct SourceLocation { + StringRef Name; + int32_t Line; + StringRef Filename; + StringRef Directory; + }; + + /// Map of local ID to SourceLocation. + DenseMap LocalIdToSourceLocationMap; + + /// Create a struct type to match the "struct SourceLocation" type. + /// (and the source_loc_t type in csi.h). + static StructType *getSourceLocStructType(LLVMContext &C); + + /// Append the line and file information to the table. 
+ void add(uint64_t ID, int32_t Line = -1, + StringRef Filename = "", StringRef Directory = "", + StringRef Name = ""); +}; + +namespace { +struct CilkSanitizerImpl : public CSIImpl { + // Class to manage inserting instrumentation without static race detection. + class SimpleInstrumentor { + public: + SimpleInstrumentor(CilkSanitizerImpl &CilkSanImpl, TaskInfo &TI, + LoopInfo &LI, DominatorTree &DT, + const TargetLibraryInfo *TLI) + : CilkSanImpl(CilkSanImpl), TI(TI), LI(LI), DT(DT), + DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy), TLI(TLI) {} + + bool InstrumentSimpleInstructions( + SmallVectorImpl &Instructions); + bool InstrumentAnyMemIntrinsics( + SmallVectorImpl &MemIntrinsics); + bool InstrumentCalls(SmallVectorImpl &Calls); + bool InstrumentAncillaryInstructions( + SmallPtrSetImpl &Allocas, + SmallPtrSetImpl &AllocationFnCalls, + SmallPtrSetImpl &FreeCalls, + DenseMap &SyncRegNums, + DenseMap &SRCounters, const DataLayout &DL); + + private: + void getDetachesForInstruction(Instruction *I); + + CilkSanitizerImpl &CilkSanImpl; + TaskInfo &TI; + LoopInfo &LI; + DominatorTree &DT; + DomTreeUpdater DTU; + const TargetLibraryInfo *TLI; + + SmallPtrSet Detaches; + }; + + // Class to manage inserting instrumentation with static race detection. + class Instrumentor { + public: + Instrumentor(CilkSanitizerImpl &CilkSanImpl, RaceInfo &RI, TaskInfo &TI, + LoopInfo &LI, DominatorTree &DT, const TargetLibraryInfo *TLI) + : CilkSanImpl(CilkSanImpl), RI(RI), TI(TI), LI(LI), DT(DT), + DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy), TLI(TLI) {} + + void InsertArgMAAPs(Function &F, Value *FuncId); + bool InstrumentSimpleInstructions( + SmallVectorImpl &Instructions); + bool InstrumentAnyMemIntrinsics( + SmallVectorImpl &MemIntrinsics); + bool InstrumentCalls(SmallVectorImpl &Calls); + void GetDetachesForCoalescedInstrumentation( + SmallPtrSetImpl &LoopInstToHoist, + SmallPtrSetImpl &LoopInstToSink); + bool InstrumentAncillaryInstructions( + SmallPtrSetImpl &Allocas, + SmallPtrSetImpl &AllocationFnCalls, + SmallPtrSetImpl &FreeCalls, + DenseMap &SyncRegNums, + DenseMap &SRCounters, const DataLayout &DL); + bool InstrumentLoops(SmallPtrSetImpl &LoopInstToHoist, + SmallPtrSetImpl &LoopInstToSink, + SmallPtrSetImpl &TapirLoops, + ScalarEvolution *); + bool PerformDelayedInstrumentation(); + + private: + void getDetachesForInstruction(Instruction *I); + // A MAAP (May Access Alias in Parallel) encodes static information about + // memory access that may result in a race, in order to propagate that + // information dynamically at runtime. In particular, a MAAP for a pointer + // argument to a called function communicates to the callee whether the + // caller or some ancestor may read or write the referenced memory in + // parallel and whether the caller can provide any noalias guarantee on that + // memory location. + enum class MAAPValue : uint8_t + { + NoAccess = 0, + Mod = 1, + Ref = 2, + ModRef = Mod | Ref, + NoAlias = 4, + }; + static unsigned RaceTypeToFlagVal(RaceInfo::RaceType RT); + // Get the MAAP value for specific instruction and operand. + Value *getMAAPValue(Instruction *I, IRBuilder<> &IRB, + unsigned OperandNum = static_cast(-1), + MAAPValue DefaultMV = MAAPValue::ModRef, + bool CheckArgs = true); + // Helper method to determine noalias MAAP bit. 
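+    // As a sketch of the MAAPValue encoding above: MAAP values are small
+    // bitmasks, so, e.g., a MAAP of (Mod | NoAlias) == 5 tells the callee
+    // that the caller or an ancestor may write the referenced memory in
+    // parallel, but that the caller can guarantee the pointer does not alias
+    // its other pointer arguments:
+    //
+    //   unsigned MV = static_cast<unsigned>(MAAPValue::Mod) |
+    //                 static_cast<unsigned>(MAAPValue::NoAlias); // == 5
+    //
+    // (Illustrative only; the helper declared below computes just the
+    // NoAlias bit of such a mask.)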
+ Value *getNoAliasMAAPValue(Instruction *I, IRBuilder<> &IRB, + unsigned OperandNum, MemoryLocation Loc, + const RaceInfo::RaceData &RD, + const Value *Obj, Value *MAAPVal); + // Synthesize a check of the MAAP to determine whether the MAAP means we can + // skip executing instrumentation for the given instruction. + Value *getMAAPCheck(Instruction *I, IRBuilder<> &IRB, + unsigned OperandNum = static_cast(-1)); + // Helper method to read a MAAP value. + Value *readMAAPVal(Value *V, IRBuilder<> &IRB); + + CilkSanitizerImpl &CilkSanImpl; + RaceInfo &RI; + TaskInfo &TI; + LoopInfo &LI; + DominatorTree &DT; + DomTreeUpdater DTU; + const TargetLibraryInfo *TLI; + + SmallPtrSet Detaches; + + DenseMap LocalMAAPs; + SmallPtrSet ArgMAAPs; + + SmallVector DelayedSimpleInsts; + SmallVector, 8> DelayedMemIntrinsics; + SmallVector DelayedCalls; + }; + + // TODO: With recent changes in LLVM's JIT technology, the JitMode option + // no longer seems to be necessary. + CilkSanitizerImpl(Module &M, CallGraph *CG, + function_ref GetDomTree, + function_ref GetTaskInfo, + function_ref GetLoopInfo, + function_ref GetRaceInfo, + function_ref GetTLI, + function_ref GetSE, + // function_ref GetTTI, + bool CallsMayThrow = !AssumeNoExceptions, + bool JitMode = false) + : CSIImpl(M, CG, GetDomTree, GetLoopInfo, GetTaskInfo, GetTLI, GetSE, + nullptr), + GetRaceInfo(GetRaceInfo) { + // Even though we're doing our own instrumentation, we want the CSI setup + // for the instrumentation of function entry/exit, memory accesses (i.e., + // loads and stores), atomics, memory intrinsics. We also want call sites, + // for extracting debug information. + Options.InstrumentBasicBlocks = false; + Options.InstrumentLoops = true; + // Cilksan defines its own hooks for instrumenting memory accesses, memory + // intrinsics, and Tapir instructions, so we disable the default CSI + // instrumentation hooks for these IR objects. + Options.InstrumentMemoryAccesses = false; + Options.InstrumentMemIntrinsics = false; + Options.InstrumentTapir = false; + Options.InstrumentCalls = false; + Options.jitMode = JitMode; + Options.CallsMayThrow = CallsMayThrow; + } + bool setup(bool NeedToSetupCalls); + bool run(); + + static StructType *getUnitObjTableType(LLVMContext &C, + PointerType *EntryPointerType); + static Constant *objTableToUnitObjTable(Module &M, + StructType *UnitObjTableType, + ObjectTable &ObjTable); + static bool isLibCall(const Instruction &I, const TargetLibraryInfo *TLI); + static bool simpleCallCannotRace(const Instruction &I); + static bool shouldIgnoreCall(const Instruction &I); + + static DebugLoc searchForDebugLoc(Instruction *I) { + if (DebugLoc Loc = I->getDebugLoc()) + return Loc; + + // Try to find debug information later in this block. + BasicBlock::iterator BI = I->getIterator(); + BasicBlock::const_iterator BE(I->getParent()->end()); + while (BI != BE) { + if (DebugLoc Loc = BI->getDebugLoc()) { + return Loc; + } + ++BI; + } + + // Try to find debug information earlier in this block. 
+ BI = I->getIterator(); + BasicBlock::const_iterator BB(I->getParent()->begin()); + while (BI != BB) { + --BI; + if (DebugLoc Loc = BI->getDebugLoc()) { + return Loc; + } + } + + return I->getDebugLoc(); + } + + void setupBlocks(Function &F, DominatorTree *DT = nullptr, + LoopInfo *LI = nullptr); + bool setupFunction(Function &F, bool NeedToSetupCalls); + + FunctionCallee getHookFunction(StringRef Name, FunctionType *FnTy, + AttributeList AL) { + FunctionCallee Callee = M.getOrInsertFunction(Name, FnTy, AL); + if (Function *Fn = dyn_cast(Callee.getCallee())) { + Fn->setOnlyAccessesInaccessibleMemOrArgMem(); + Fn->setDoesNotThrow(); + } + return Callee; + } + template + FunctionCallee getHookFunction(StringRef Name, AttributeList AL, Type *RetTy, + ArgsTy... Args) { + FunctionCallee Callee = M.getOrInsertFunction(Name, AL, RetTy, Args...); + if (Function *Fn = dyn_cast(Callee.getCallee())) { + MemoryEffects ME = MemoryEffects::argMemOnly(ModRefInfo::Ref) | + MemoryEffects::inaccessibleMemOnly(ModRefInfo::ModRef); + Fn->setMemoryEffects(ME); + Fn->setDoesNotThrow(); + } + return Callee; + } + template + FunctionCallee getHookFunction(StringRef Name, Type *RetTy, + ArgsTy... Args) { + return getHookFunction(Name, AttributeList{}, RetTy, Args...); + } + + // Methods for handling FED tables + void initializeFEDTables() {} + void collectUnitFEDTables() {} + + // Methods for handling object tables + void initializeCsanObjectTables(); + void collectUnitObjectTables(); + + // Create a call to the runtime unit initialization routine in a global + // constructor. + CallInst *createRTUnitInitCall(IRBuilder<> &IRB) override; + + // Initialize custom hooks for CilkSanitizer + void initializeCsanHooks(); + + Value *GetCalleeFuncID(const Function *Callee, IRBuilder<> &IRB); + + // Helper function for prepareToInstrumentFunction that chooses loads and + // stores in a basic block to instrument. + void chooseInstructionsToInstrument(SmallVectorImpl &Local, + SmallVectorImpl &All, + const TaskInfo &TI, LoopInfo &LI, + const TargetLibraryInfo *TLI); + + // Helper methods for instrumenting different IR objects. 
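+  // Several of these helpers come in two flavors: one that takes an explicit
+  // IRBuilder, and a convenience overload that constructs a builder at the
+  // instruction and falls back to searchForDebugLoc() when the instruction
+  // carries no debug location of its own, following the pattern
+  //
+  //   IRBuilder<> IRB(I);
+  //   if (!IRB.getCurrentDebugLocation())
+  //     IRB.SetCurrentDebugLocation(searchForDebugLoc(I));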
+ bool instrumentLoadOrStore(Instruction *I, IRBuilder<> &IRB); + bool instrumentLoadOrStore(Instruction *I) { + IRBuilder<> IRB(I); + if (!IRB.getCurrentDebugLocation()) + IRB.SetCurrentDebugLocation(searchForDebugLoc(I)); + return instrumentLoadOrStore(I, IRB); + } + bool instrumentAtomic(Instruction *I, IRBuilder<> &IRB); + bool instrumentAtomic(Instruction *I) { + IRBuilder<> IRB(I); + if (!IRB.getCurrentDebugLocation()) + IRB.SetCurrentDebugLocation(searchForDebugLoc(I)); + return instrumentAtomic(I, IRB); + } + bool instrumentIntrinsicCall(Instruction *I, + SmallVectorImpl *MAAPVals = nullptr); + bool instrumentLibCall(Instruction *I, + SmallVectorImpl *MAAPVals = nullptr); + bool instrumentCallsite(Instruction *I, + SmallVectorImpl *MAAPVals = nullptr); + bool suppressCallsite(Instruction *I); + bool instrumentAllocFnLibCall(Instruction *I, const TargetLibraryInfo *TLI); + bool instrumentAllocationFn(Instruction *I, DominatorTree &DT, + const TargetLibraryInfo *TLI); + bool instrumentFree(Instruction *I, const TargetLibraryInfo *TLI); + bool instrumentDetach(DetachInst *DI, unsigned SyncRegNum, + unsigned NumSyncRegs, DominatorTree &DT, TaskInfo &TI, + LoopInfo &LI); + bool instrumentSync(SyncInst *SI, unsigned SyncRegNum); + void instrumentTapirLoop(Loop &L, TaskInfo &TI, + DenseMap &SyncRegNums, + ScalarEvolution *SE = nullptr); + bool instrumentAlloca(Instruction *I, TaskInfo &TI); + + bool instrumentFunctionUsingRI(Function &F); + // Helper method for RI-based race detection for instrumenting an access by a + // memory intrinsic. + bool instrumentAnyMemIntrinAcc(Instruction *I, unsigned OperandNum, + IRBuilder<> &IRB); + bool instrumentAnyMemIntrinAcc(Instruction *I, unsigned OperandNum) { + IRBuilder<> IRB(I); + if (!IRB.getCurrentDebugLocation()) + IRB.SetCurrentDebugLocation(searchForDebugLoc(I)); + return instrumentAnyMemIntrinAcc(I, OperandNum, IRB); + } + + bool instrumentLoadOrStoreHoisted(Instruction *I, + Value *Addr, + Value *RangeVal, + IRBuilder<> &IRB, + uint64_t LocalId); + +private: + // Analysis results + function_ref GetRaceInfo; + + // Instrumentation hooks + FunctionCallee CsanFuncEntry = nullptr; + FunctionCallee CsanFuncExit = nullptr; + FunctionCallee CsanRead = nullptr; + FunctionCallee CsanWrite = nullptr; + FunctionCallee CsanLargeRead = nullptr; + FunctionCallee CsanLargeWrite = nullptr; + FunctionCallee CsanBeforeCallsite = nullptr; + FunctionCallee CsanAfterCallsite = nullptr; + FunctionCallee CsanDetach = nullptr; + FunctionCallee CsanDetachContinue = nullptr; + FunctionCallee CsanTaskEntry = nullptr; + FunctionCallee CsanTaskExit = nullptr; + FunctionCallee CsanSync = nullptr; + FunctionCallee CsanBeforeLoop = nullptr; + FunctionCallee CsanAfterLoop = nullptr; + FunctionCallee CsanAfterAllocFn = nullptr; + FunctionCallee CsanAfterFree = nullptr; + + // Hooks for suppressing instrumentation, e.g., around callsites that cannot + // expose a race. + FunctionCallee CsanDisableChecking = nullptr; + FunctionCallee CsanEnableChecking = nullptr; + + FunctionCallee GetMAAP = nullptr; + FunctionCallee SetMAAP = nullptr; + + // CilkSanitizer custom forensic tables + ObjectTable LoadObj, StoreObj, AllocaObj, AllocFnObj; + + SmallVector UnitObjTables; + + SmallVector AllocationFnCalls; + SmallVector FreeCalls; + SmallVector Allocas; + SmallPtrSet ToInstrument; + + // Map of functions to updated race type, for interprocedural analysis of + // races. 
+ DenseMap FunctionRaceType; + DenseMap ObjectMRForRace; + + DenseMap> DetachToSync; + + bool LocalBaseObj(const Value *Addr, LoopInfo *LI, + const TargetLibraryInfo *TLI) const; + bool PossibleRaceByCapture(const Value *Addr, const TaskInfo &TI, + LoopInfo *LI) const; + bool unknownObjectUses(const Value *Addr, LoopInfo *LI, + const TargetLibraryInfo *TLI) const; + + // Cached results of calls to getUnderlyingObjects. + using BaseObjMapTy = + DenseMap>; + mutable BaseObjMapTy BaseObjects; + SmallVectorImpl &lookupBaseObjects(const Value *Addr, + LoopInfo *LI) const { + if (!BaseObjects.count(Addr)) { + if (isa(Addr)) + BaseObjects.lookup(Addr); + else + getUnderlyingObjects(Addr, BaseObjects[Addr], LI, 0); + } + return BaseObjects[Addr]; + } + + bool MightHaveDetachedUse(const Value *Addr, const TaskInfo &TI) const; + // // Cached results of calls to MightHaveDetachedUse. + // using DetachedUseMapTy = DenseMap; + // mutable DetachedUseMapTy DetachedUseCache; + bool lookupMightHaveDetachedUse(const Value *Addr, const TaskInfo &TI) const { + return MightHaveDetachedUse(Addr, TI); + // if (!DetachedUseCache.count(Addr)) + // DetachedUseCache[Addr] = MightHaveDetachedUse(Addr, TI); + // return DetachedUseCache[Addr]; + } + + // Cached results of calls to PointerMayBeCaptured. + using MayBeCapturedMapTy = DenseMap; + mutable MayBeCapturedMapTy MayBeCapturedCache; + bool lookupPointerMayBeCaptured(const Value *Ptr) const { + if (!Ptr->getType()->isPointerTy()) + return false; + + if (!MayBeCapturedCache.count(Ptr)) { + if (isa(Ptr)) + MayBeCapturedCache.lookup(Ptr); + else + MayBeCapturedCache[Ptr] = PointerMayBeCaptured(Ptr, true, false, + MaxUsesToExploreCapture); + } + return MayBeCapturedCache[Ptr]; + } + + FunctionCallee getOrInsertSynthesizedHook(StringRef Name, FunctionType *T, + AttributeList AL = AttributeList()); +}; + +/// CilkSanitizer: instrument the code in module to find races. +struct CilkSanitizerLegacyPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid. 
+ CilkSanitizerLegacyPass(bool CallsMayThrow = !AssumeNoExceptions, + bool JitMode = false) + : ModulePass(ID), JitMode(JitMode), CallsMayThrow(CallsMayThrow) { + initializeCilkSanitizerLegacyPassPass(*PassRegistry::getPassRegistry()); + } + StringRef getPassName() const override { return "CilkSanitizer"; } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnModule(Module &M) override; + + bool JitMode = false; + bool CallsMayThrow = true; +}; +} // end anonymous namespace + +char CilkSanitizerLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN( + CilkSanitizerLegacyPass, "csan", + "CilkSanitizer: detects determinacy races in Cilk programs.", + false, false) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TapirRaceDetectWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END( + CilkSanitizerLegacyPass, "csan", + "CilkSanitizer: detects determinacy races in Cilk programs.", + false, false) + +void CilkSanitizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); +} + +uint64_t ObjectTable::add(Instruction &I, Value *Obj) { + uint64_t ID = getId(&I); + if (isa(Obj)) { + add(ID, -1, "", "", "(undef)"); + return ID; + } + + // First, if the underlying object is a global variable, get that variable's + // debug information. + if (GlobalVariable *GV = dyn_cast(Obj)) { + SmallVector DbgGVExprs; + GV->getDebugInfo(DbgGVExprs); + for (auto *GVE : DbgGVExprs) { + auto *DGV = GVE->getVariable(); + if (DGV->getName() != "") { + add(ID, DGV->getLine(), DGV->getFilename(), DGV->getDirectory(), + DGV->getName()); + return ID; + } + } + add(ID, -1, "", "", Obj->getName()); + return ID; + } + + // Otherwise, if the underlying object is a function, get that function's + // debug information. + if (Function *F = dyn_cast(Obj)) { + if (DISubprogram *SP = F->getSubprogram()) { + add(ID, SP->getLine(), SP->getFilename(), SP->getDirectory(), + SP->getName()); + return ID; + } + add(ID, -1, "", "", Obj->getName()); + return ID; + } + + // Next, if this is an alloca instruction, look for a llvm.dbg.declare + // intrinsic. + if (AllocaInst *AI = dyn_cast(Obj)) { + TinyPtrVector DbgDeclares = FindDbgDeclareUses(AI); + if (!DbgDeclares.empty()) { + auto *LV = DbgDeclares.front()->getVariable(); + add(ID, LV->getLine(), LV->getFilename(), LV->getDirectory(), + LV->getName()); + return ID; + } + } + + // Otherwise just examine the llvm.dbg.value intrinsics for this object. 
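+  // This is the last resort: llvm.dbg.value does not pin the object to a
+  // stack slot, but its attached local variable still provides a name, line,
+  // and file for race reports.  Schematically (hypothetical IR):
+  //
+  //   call void @llvm.dbg.value(metadata ptr %obj,
+  //                             metadata !DILocalVariable(name: "x", line: 7),
+  //                             metadata !DIExpression())
+  //
+  // would record the name "x" and line 7.  If no named variable is found, the
+  // fallback below records the IR value's own name with an unknown location.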
+ SmallVector DbgValues; + findDbgValues(DbgValues, Obj); + for (auto *DVI : DbgValues) { + auto *LV = DVI->getVariable(); + if (LV->getName() != "") { + add(ID, LV->getLine(), LV->getFilename(), LV->getDirectory(), + LV->getName()); + return ID; + } + } + + add(ID, -1, "", "", Obj->getName()); + return ID; +} + +PointerType *ObjectTable::getPointerType(LLVMContext &C) { + return PointerType::get(getSourceLocStructType(C), 0); +} + +StructType *ObjectTable::getSourceLocStructType(LLVMContext &C) { + return StructType::get( + /* Name */ PointerType::get(IntegerType::get(C, 8), 0), + /* Line */ IntegerType::get(C, 32), + /* File */ PointerType::get(IntegerType::get(C, 8), 0)); +} + +void ObjectTable::add(uint64_t ID, int32_t Line, + StringRef Filename, StringRef Directory, + StringRef Name) { + assert(LocalIdToSourceLocationMap.find(ID) == + LocalIdToSourceLocationMap.end() && + "Id already exists in FED table."); + LocalIdToSourceLocationMap[ID] = {Name, Line, Filename, Directory}; +} + +// The order of arguments to ConstantStruct::get() must match the +// obj_source_loc_t type in csan.h. +static void addObjTableEntries(SmallVectorImpl &TableEntries, + StructType *TableType, Constant *Name, + Constant *Line, Constant *File) { + TableEntries.push_back(ConstantStruct::get(TableType, Name, Line, File)); +} + +Constant *ObjectTable::insertIntoModule(Module &M) const { + LLVMContext &C = M.getContext(); + StructType *TableType = getSourceLocStructType(C); + IntegerType *Int32Ty = IntegerType::get(C, 32); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + Value *GepArgs[] = {Zero, Zero}; + SmallVector TableEntries; + + // Get the object-table entries for each ID. + for (uint64_t LocalID = 0; LocalID < IdCounter; ++LocalID) { + const SourceLocation &E = LocalIdToSourceLocationMap.find(LocalID)->second; + // Source line + Constant *Line = ConstantInt::get(Int32Ty, E.Line); + // Source file + Constant *File; + { + std::string Filename = E.Filename.str(); + if (!E.Directory.empty()) + Filename = E.Directory.str() + "/" + Filename; + File = getObjectStrGV(M, Filename, "__csi_unit_filename_"); + } + // Variable name + Constant *Name = getObjectStrGV(M, E.Name, "__csi_unit_object_name_"); + + // Add entry to the table + addObjTableEntries(TableEntries, TableType, Name, Line, File); + } + + ArrayType *TableArrayType = ArrayType::get(TableType, TableEntries.size()); + Constant *Table = ConstantArray::get(TableArrayType, TableEntries); + GlobalVariable *GV = + new GlobalVariable(M, TableArrayType, false, GlobalValue::InternalLinkage, + Table, CsiUnitObjTableName); + return ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); +} + +namespace { + +using SCCNodeSet = SmallSetVector; + +} // end anonymous namespace + +bool CilkSanitizerImpl::setup(bool NeedToSetupCalls) { + // Setup functions for instrumentation. + for (scc_iterator I = scc_begin(CG); !I.isAtEnd(); ++I) { + const std::vector &SCC = *I; + for (CallGraphNode *N : SCC) + if (Function *F = N->getFunction()) + setupFunction(*F, NeedToSetupCalls); + } + return true; +} + +bool CilkSanitizerImpl::run() { + // Link the tool bitcode once initially, to get type definitions. + linkInToolFromBitcode(ClCilksanBCPath); + // Initialize components of the CSI and Cilksan system. + initializeCsi(); + initializeFEDTables(); + initializeCsanObjectTables(); + initializeCsanHooks(); + + // Evaluate the SCC's in the callgraph in post order to support + // interprocedural analysis of potential races in the module. 
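+  // Post order means callees are visited before their callers, so by the time
+  // a call site is examined in the caller, FunctionRaceType can already hold
+  // a summary for the callee.  Schematically (hypothetical program):
+  //
+  //   void g(int *p) { *p = 1; }                // analyzed first
+  //   void f(int *p) { cilk_spawn g(p); g(p); } // analyzed second, reusing
+  //                                             // g()'s race summary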
+ SmallVector InstrumentedFunctions; + + // Instrument functions. + for (scc_iterator I = scc_begin(CG); !I.isAtEnd(); ++I) { + const std::vector &SCC = *I; + for (CallGraphNode *N : SCC) { + if (Function *F = N->getFunction()) + if (instrumentFunctionUsingRI(*F)) + InstrumentedFunctions.push_back(F); + } + } + // After all functions have been analyzed and instrumented, update their + // attributes. + for (Function *F : InstrumentedFunctions) { + updateInstrumentedFnAttrs(*F); + F->removeFnAttr(Attribute::SanitizeCilk); + } + + CSIImpl::collectUnitFEDTables(); + collectUnitFEDTables(); + collectUnitObjectTables(); + finalizeCsi(); + + // Link the tool bitcode a second time, for definitions of used functions. + linkInToolFromBitcode(ClCilksanBCPath); + return true; +} + +void CilkSanitizerImpl::initializeCsanObjectTables() { + LoadObj = ObjectTable(M, CsiLoadBaseIdName); + StoreObj = ObjectTable(M, CsiStoreBaseIdName); + AllocaObj = ObjectTable(M, CsiAllocaBaseIdName); + AllocFnObj = ObjectTable(M, CsiAllocFnBaseIdName); +} + +// Create a struct type to match the unit_obj_entry_t type in csanrt.c. +StructType *CilkSanitizerImpl::getUnitObjTableType( + LLVMContext &C, PointerType *EntryPointerType) { + return StructType::get(IntegerType::get(C, 64), EntryPointerType); +} + +Constant *CilkSanitizerImpl::objTableToUnitObjTable( + Module &M, StructType *UnitObjTableType, ObjectTable &ObjTable) { + Constant *NumEntries = + ConstantInt::get(IntegerType::get(M.getContext(), 64), ObjTable.size()); + // Constant *BaseIdPtr = + // ConstantExpr::getPointerCast(FedTable.baseId(), + // Type::getInt8PtrTy(M.getContext(), 0)); + Constant *InsertedTable = ObjTable.insertIntoModule(M); + return ConstantStruct::get(UnitObjTableType, NumEntries, + InsertedTable); +} + +void CilkSanitizerImpl::collectUnitObjectTables() { + LLVMContext &C = M.getContext(); + StructType *UnitObjTableType = + getUnitObjTableType(C, ObjectTable::getPointerType(C)); + + UnitObjTables.push_back( + objTableToUnitObjTable(M, UnitObjTableType, LoadObj)); + UnitObjTables.push_back( + objTableToUnitObjTable(M, UnitObjTableType, StoreObj)); + UnitObjTables.push_back( + objTableToUnitObjTable(M, UnitObjTableType, AllocaObj)); + UnitObjTables.push_back( + objTableToUnitObjTable(M, UnitObjTableType, AllocFnObj)); +} + +CallInst *CilkSanitizerImpl::createRTUnitInitCall(IRBuilder<> &IRB) { + LLVMContext &C = M.getContext(); + + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + StructType *UnitObjTableType = + getUnitObjTableType(C, ObjectTable::getPointerType(C)); + + // Lookup __csanrt_unit_init + SmallVector InitArgTypes({IRB.getInt8PtrTy(), + PointerType::get(UnitFedTableType, 0), + PointerType::get(UnitObjTableType, 0), + InitCallsiteToFunction->getType()}); + FunctionType *InitFunctionTy = + FunctionType::get(IRB.getVoidTy(), InitArgTypes, false); + RTUnitInit = M.getOrInsertFunction(CsanRtUnitInitName, InitFunctionTy); + assert(isa(RTUnitInit.getCallee()) && + "Failed to get or insert __csanrt_unit_init function"); + + ArrayType *UnitFedTableArrayType = + ArrayType::get(UnitFedTableType, UnitFedTables.size()); + Constant *FEDTable = ConstantArray::get(UnitFedTableArrayType, UnitFedTables); + GlobalVariable *FEDGV = new GlobalVariable(M, UnitFedTableArrayType, false, + GlobalValue::InternalLinkage, FEDTable, + CsiUnitFedTableArrayName); + + ArrayType *UnitObjTableArrayType = + ArrayType::get(UnitObjTableType, UnitObjTables.size()); + Constant *ObjTable = 
ConstantArray::get(UnitObjTableArrayType, UnitObjTables); + GlobalVariable *ObjGV = new GlobalVariable(M, UnitObjTableArrayType, false, + GlobalValue::InternalLinkage, ObjTable, + CsiUnitObjTableArrayName); + + Constant *Zero = ConstantInt::get(IRB.getInt32Ty(), 0); + Value *GepArgs[] = {Zero, Zero}; + + // Insert call to __csanrt_unit_init + return IRB.CreateCall( + RTUnitInit, + {IRB.CreateGlobalStringPtr(M.getName()), + ConstantExpr::getGetElementPtr(FEDGV->getValueType(), FEDGV, GepArgs), + ConstantExpr::getGetElementPtr(ObjGV->getValueType(), ObjGV, GepArgs), + InitCallsiteToFunction}); +} + +// Initialize all instrumentation hooks that are specific to CilkSanitizer. +void CilkSanitizerImpl::initializeCsanHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *FuncPropertyTy = CsiFuncProperty::getType(C); + Type *FuncExitPropertyTy = CsiFuncExitProperty::getType(C); + Type *TaskPropertyTy = CsiTaskProperty::getType(C); + Type *TaskExitPropertyTy = CsiTaskExitProperty::getType(C); + Type *LoadPropertyTy = CsiLoadStoreProperty::getType(C); + Type *StorePropertyTy = CsiLoadStoreProperty::getType(C); + Type *CallPropertyTy = CsiCallProperty::getType(C); + Type *LoopPropertyTy = CsiLoopProperty::getType(C); + Type *AllocFnPropertyTy = CsiAllocFnProperty::getType(C); + Type *FreePropertyTy = CsiFreeProperty::getType(C); + Type *DetachPropertyTy = CsiDetachProperty::getType(C); + Type *DetContPropertyTy = CsiDetachContinueProperty::getType(C); + Type *RetType = IRB.getVoidTy(); + Type *AddrType = IRB.getInt8PtrTy(); + Type *NumBytesType = IRB.getInt32Ty(); + Type *LargeNumBytesType = IntptrTy; + Type *IDType = IRB.getInt64Ty(); + Type *SyncRegType = IRB.getInt32Ty(); + + { + AttributeList FnAttrs; + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::ReadNone); + FnAttrs = FnAttrs.addParamAttribute(C, 2, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 2, Attribute::ReadNone); + CsanFuncEntry = getHookFunction("__csan_func_entry", FnAttrs, RetType, + /* func_id */ IDType, + /* frame_ptr */ AddrType, + /* stack_ptr */ AddrType, FuncPropertyTy); + } + { + CsanFuncExit = getHookFunction("__csan_func_exit", RetType, + /* func_exit_id */ IDType, + /* func_id */ IDType, FuncExitPropertyTy); + } + + { + AttributeList FnAttrs; + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::ReadNone); + CsanRead = getHookFunction("__csan_load", FnAttrs, RetType, IDType, + AddrType, NumBytesType, LoadPropertyTy); + } + { + AttributeList FnAttrs; + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::ReadNone); + CsanWrite = getHookFunction("__csan_store", FnAttrs, RetType, IDType, + AddrType, NumBytesType, StorePropertyTy); + } + { + AttributeList FnAttrs; + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::ReadNone); + CsanLargeRead = + getHookFunction("__csan_large_load", FnAttrs, RetType, IDType, AddrType, + LargeNumBytesType, LoadPropertyTy); + } + { + AttributeList FnAttrs; + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::ReadNone); + CsanLargeWrite = + getHookFunction("__csan_large_store", FnAttrs, RetType, IDType, + AddrType, LargeNumBytesType, StorePropertyTy); + } + + { + CsanBeforeCallsite = getHookFunction( + 
"__csan_before_call", IRB.getVoidTy(), IDType, + /*callee func_id*/ IDType, IRB.getInt8Ty(), CallPropertyTy); + } + { + CsanAfterCallsite = + getHookFunction("__csan_after_call", IRB.getVoidTy(), IDType, IDType, + IRB.getInt8Ty(), CallPropertyTy); + } + + { + CsanDetach = getHookFunction("__csan_detach", RetType, + /* detach_id */ IDType, + /* sync_reg */ SyncRegType, DetachPropertyTy); + } + { + AttributeList FnAttrs; + FnAttrs = FnAttrs.addParamAttribute(C, 2, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 2, Attribute::ReadNone); + FnAttrs = FnAttrs.addParamAttribute(C, 3, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 3, Attribute::ReadNone); + CsanTaskEntry = getHookFunction("__csan_task", FnAttrs, RetType, + /* task_id */ IDType, + /* detach_id */ IDType, + /* frame_ptr */ AddrType, + /* stack_ptr */ AddrType, TaskPropertyTy); + } + { + CsanTaskExit = + getHookFunction("__csan_task_exit", RetType, + /* task_exit_id */ IDType, + /* task_id */ IDType, + /* detach_id */ IDType, + /* sync_reg */ SyncRegType, TaskExitPropertyTy); + } + { + CsanDetachContinue = + getHookFunction("__csan_detach_continue", RetType, + /* detach_continue_id */ IDType, + /* detach_id */ IDType, + /* sync_reg */ SyncRegType, DetContPropertyTy); + } + { + CsanSync = getHookFunction("__csan_sync", RetType, IDType, + /* sync_reg */ SyncRegType); + } + + { + AttributeList FnAttrs; + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::ReadNone); + FnAttrs = FnAttrs.addParamAttribute(C, 5, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 5, Attribute::ReadNone); + CsanAfterAllocFn = getHookFunction( + "__csan_after_allocfn", FnAttrs, RetType, IDType, + /* new ptr */ AddrType, /* size */ LargeNumBytesType, + /* num elements */ LargeNumBytesType, /* alignment */ LargeNumBytesType, + /* old ptr */ AddrType, /* property */ AllocFnPropertyTy); + } + { + AttributeList FnAttrs; + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::NoCapture); + FnAttrs = FnAttrs.addParamAttribute(C, 1, Attribute::ReadNone); + CsanAfterFree = + getHookFunction("__csan_after_free", FnAttrs, RetType, IDType, AddrType, + /* property */ FreePropertyTy); + } + + { + CsanDisableChecking = + getHookFunction("__cilksan_disable_checking", RetType); + } + { + CsanEnableChecking = getHookFunction("__cilksan_enable_checking", + RetType); + } + + Type *MAAPTy = IRB.getInt8Ty(); + { + AttributeList FnAttrs; + FnAttrs = FnAttrs.addParamAttribute(C, 0, Attribute::NoCapture); + GetMAAP = + getHookFunction("__csan_get_MAAP", FnAttrs, RetType, + PointerType::get(MAAPTy, 0), IDType, IRB.getInt8Ty()); + // Unlike other hooks, GetMAAP writes to its pointer argument. Make sure + // the MemoryEffects on the hook reflect this fact. 
+    Function *HookFn = cast<Function>(GetMAAP.getCallee());
+    HookFn->setMemoryEffects(HookFn->getMemoryEffects() |
+                             MemoryEffects::argMemOnly(ModRefInfo::ModRef));
+  }
+  {
+    SetMAAP = getHookFunction("__csan_set_MAAP", RetType, MAAPTy, IDType);
+  }
+
+  {
+    CsanBeforeLoop = getHookFunction("__csan_before_loop", IRB.getVoidTy(),
+                                     IDType, IRB.getInt64Ty(), LoopPropertyTy);
+  }
+  {
+    CsanAfterLoop = getHookFunction("__csan_after_loop", IRB.getVoidTy(),
+                                    IDType, IRB.getInt8Ty(), LoopPropertyTy);
+  }
+
+  // Cilksan-specific attributes on CSI hooks.
+  Function *CsiAfterAllocaFn = cast<Function>(CsiAfterAlloca.getCallee());
+  CsiAfterAllocaFn->addParamAttr(1, Attribute::NoCapture);
+  CsiAfterAllocaFn->addParamAttr(1, Attribute::ReadNone);
+  CsiAfterAllocaFn->setOnlyAccessesInaccessibleMemOrArgMem();
+  CsiAfterAllocaFn->setDoesNotThrow();
+}
+
+static BasicBlock *SplitOffPreds(BasicBlock *BB,
+                                 SmallVectorImpl<BasicBlock *> &Preds,
+                                 DominatorTree *DT, LoopInfo *LI) {
+  if (BB->isLandingPad()) {
+    SmallVector<BasicBlock *, 2> NewBBs;
+    SplitLandingPadPredecessors(BB, Preds, ".csi-split-lp", ".csi-split",
+                                NewBBs, DT, LI);
+    return NewBBs[1];
+  }
+
+  BasicBlock *NewBB = SplitBlockPredecessors(BB, Preds, ".csi-split", DT, LI);
+  if (isa<UnreachableInst>(BB->getFirstNonPHIOrDbg())) {
+    // If the block being split simply contains an unreachable, then replace
+    // the terminator of the new block with an unreachable.  This helps
+    // preserve invariants on the CFG structure for Tapir placeholder blocks
+    // following detached.rethrow and taskframe.resume terminators.
+    ReplaceInstWithInst(NewBB->getTerminator(),
+                        new UnreachableInst(BB->getContext()));
+    if (DT) {
+      DT->deleteEdge(NewBB, BB);
+    }
+  }
+  return BB;
+}
+
+// Set up each block such that all of its predecessors belong to the same CSI
+// ID space.
+static void setupBlock(BasicBlock *BB, DominatorTree *DT, LoopInfo *LI,
+                       const TargetLibraryInfo *TLI) {
+  if (BB->isLandingPad()) {
+    LandingPadInst *LPad = BB->getLandingPadInst();
+    if (!LPad->isCleanup())
+      LPad->setCleanup(true);
+  }
+
+  if (BB->getUniquePredecessor())
+    return;
+
+  SmallVector<BasicBlock *> DetachPreds;
+  SmallVector<BasicBlock *> TFResumePreds;
+  SmallVector<BasicBlock *> SyncPreds;
+  SmallVector<BasicBlock *> SyncUnwindPreds;
+  SmallVector<BasicBlock *> AllocFnPreds;
+  SmallVector<BasicBlock *> FreeFnPreds;
+  DenseMap<const Function *, SmallVector<BasicBlock *>> LibCallPreds;
+  SmallVector<BasicBlock *> InvokePreds;
+  bool HasOtherPredTypes = false;
+  unsigned NumPredTypes = 0;
+
+  // Partition the predecessors of the landing pad.
+ for (BasicBlock *Pred : predecessors(BB)) { + if (isa(Pred->getTerminator()) || + isa(Pred->getTerminator()) || + isDetachedRethrow(Pred->getTerminator())) + DetachPreds.push_back(Pred); + else if (isTaskFrameResume(Pred->getTerminator())) + TFResumePreds.push_back(Pred); + else if (isa(Pred->getTerminator())) + SyncPreds.push_back(Pred); + else if (isSyncUnwind(Pred->getTerminator())) + SyncUnwindPreds.push_back(Pred); + else if (CilkSanitizerImpl::isAllocFn(Pred->getTerminator(), TLI)) + AllocFnPreds.push_back(Pred); + else if (CilkSanitizerImpl::isFreeFn(Pred->getTerminator(), TLI)) + FreeFnPreds.push_back(Pred); + else if (CilkSanitizerImpl::isLibCall(*Pred->getTerminator(), TLI)) { + const Function *Called = + dyn_cast(Pred->getTerminator())->getCalledFunction(); + LibCallPreds[Called].push_back(Pred); + } else if (isa(Pred->getTerminator())) + InvokePreds.push_back(Pred); + else + HasOtherPredTypes = true; + } + + NumPredTypes = static_cast(!DetachPreds.empty()) + + static_cast(!TFResumePreds.empty()) + + static_cast(!SyncPreds.empty()) + + static_cast(!SyncUnwindPreds.empty()) + + static_cast(!AllocFnPreds.empty()) + + static_cast(!FreeFnPreds.empty()) + + static_cast(LibCallPreds.size()) + + static_cast(!InvokePreds.empty()) + + static_cast(HasOtherPredTypes); + + // Splitting predecessors works differently for landingpads versus normal + // basic blocks. If the block is not a landingpad, split off every type of + // predecessor. + unsigned NumPredTypesRequired = static_cast(BB->isLandingPad()); + if (NumPredTypes <= NumPredTypesRequired) + return; + + BasicBlock *BBToSplit = BB; + // Split off the predecessors of each type. + if (!SyncPreds.empty() && NumPredTypes > NumPredTypesRequired) { + BBToSplit = SplitOffPreds(BBToSplit, SyncPreds, DT, LI); + NumPredTypes--; + } + if (!SyncUnwindPreds.empty() && NumPredTypes > NumPredTypesRequired) { + BBToSplit = SplitOffPreds(BBToSplit, SyncUnwindPreds, DT, LI); + NumPredTypes--; + } + if (!AllocFnPreds.empty() && NumPredTypes > NumPredTypesRequired) { + BBToSplit = SplitOffPreds(BBToSplit, AllocFnPreds, DT, LI); + NumPredTypes--; + } + if (!FreeFnPreds.empty() && NumPredTypes > NumPredTypesRequired) { + BBToSplit = SplitOffPreds(BBToSplit, FreeFnPreds, DT, LI); + NumPredTypes--; + } + if (!LibCallPreds.empty() && NumPredTypes > NumPredTypesRequired) { + for (auto KeyVal : LibCallPreds) { + if (NumPredTypes > NumPredTypesRequired) { + BBToSplit = SplitOffPreds(BBToSplit, KeyVal.second, DT, LI); + NumPredTypes--; + } + } + } + if (!InvokePreds.empty() && NumPredTypes > NumPredTypesRequired) { + BBToSplit = SplitOffPreds(BBToSplit, InvokePreds, DT, LI); + NumPredTypes--; + } + if (!TFResumePreds.empty() && NumPredTypes > NumPredTypesRequired) { + BBToSplit = SplitOffPreds(BBToSplit, TFResumePreds, DT, LI); + NumPredTypes--; + } + // We handle detach and detached.rethrow predecessors at the end to preserve + // invariants on the CFG structure about the deadness of basic blocks after + // detached-rethrows. + if (!DetachPreds.empty() && NumPredTypes > NumPredTypesRequired) { + BBToSplit = SplitOffPreds(BBToSplit, DetachPreds, DT, LI); + NumPredTypes--; + } +} + +// Setup all basic blocks such that each block's predecessors belong entirely to +// one CSI ID space. 
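+// For example (illustrative), a landing pad reachable from both an invoke and
+// a detached.rethrow would otherwise receive instrumentation from two
+// different ID spaces; setupBlock() splits such predecessors apart so that
+// each resulting block sees predecessors of only one kind.  Only blocks that
+// can actually mix predecessor kinds are collected below: landing pads, the
+// normal destinations of invokes, and the continuation blocks of syncs.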
+void CilkSanitizerImpl::setupBlocks(Function &F, DominatorTree *DT,
+                                    LoopInfo *LI) {
+  SmallPtrSet<BasicBlock *, 8> BlocksToSetup;
+  for (BasicBlock &BB : F) {
+    if (BB.isLandingPad())
+      BlocksToSetup.insert(&BB);
+
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
+      if (!isTapirPlaceholderSuccessor(II->getNormalDest()))
+        BlocksToSetup.insert(II->getNormalDest());
+    } else if (SyncInst *SI = dyn_cast<SyncInst>(BB.getTerminator()))
+      BlocksToSetup.insert(SI->getSuccessor(0));
+  }
+
+  for (BasicBlock *BB : BlocksToSetup)
+    setupBlock(BB, DT, LI, &GetTLI(F));
+}
+
+// Do not instrument known races/"benign races" that come from compiler
+// instrumentation.  The user has no way of suppressing them.
+static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
+  // Peel off GEPs and BitCasts.
+  Addr = Addr->stripInBoundsOffsets();
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    if (GV->hasSection()) {
+      StringRef SectionName = GV->getSection();
+      // Check if the global is in the PGO counters section.
+      auto OF = Triple(M->getTargetTriple()).getObjectFormat();
+      if (SectionName.endswith(
+              getInstrProfSectionName(IPSK_cnts, OF,
+                                      /*AddSegmentInfo*/ false)))
+        return false;
+    }
+
+    // Check if the global is private gcov data.
+    if (GV->getName().startswith("__llvm_gcov") ||
+        GV->getName().startswith("__llvm_gcda"))
+      return false;
+  }
+
+  // Do not instrument accesses from different address spaces; we cannot deal
+  // with them.
+  if (Addr) {
+    Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
+    if (PtrTy->getPointerAddressSpace() != 0)
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if Addr can only refer to a locally allocated base object,
+/// that is, an object created via an AllocaInst or an AllocationFn.
+bool CilkSanitizerImpl::LocalBaseObj(const Value *Addr, LoopInfo *LI,
+                                     const TargetLibraryInfo *TLI) const {
+  // If we don't have an address, give up.
+  if (!Addr)
+    return false;
+
+  // Get the base objects that this address might refer to.
+  SmallVectorImpl<const Value *> &BaseObjs = lookupBaseObjects(Addr, LI);
+
+  // If we could not determine the base objects, conservatively return false.
+  if (BaseObjs.empty())
+    return false;
+
+  // If any base object is not an alloca or allocation function, then it's not
+  // local.
+  for (const Value *BaseObj : BaseObjs) {
+    if (isa<AllocaInst>(BaseObj) || isNoAliasCall(BaseObj))
+      continue;
+
+    if (const Argument *A = dyn_cast<Argument>(BaseObj))
+      if (A->hasByValAttr())
+        continue;
+
+    LLVM_DEBUG(dbgs() << "Non-local base object " << *BaseObj << "\n");
+    return false;
+  }
+
+  return true;
+}
+
+// Examine the uses of an Instruction AI to determine if it is used in a
+// subtask.  This method assumes that AI is an allocation instruction, i.e.,
+// either an AllocaInst or an AllocationFn.
+bool CilkSanitizerImpl::MightHaveDetachedUse(const Value *V,
+                                             const TaskInfo &TI) const {
+  // Get the task for this allocation.
+  const Task *AllocTask = nullptr;
+  if (const Instruction *I = dyn_cast<Instruction>(V))
+    AllocTask = TI.getTaskFor(I->getParent());
+  else if (const Argument *A = dyn_cast<Argument>(V))
+    AllocTask = TI.getTaskFor(&A->getParent()->getEntryBlock());
+
+  // assert(AllocTask && "Null task for instruction.");
+  if (!AllocTask) {
+    LLVM_DEBUG(dbgs() << "MightHaveDetachedUse: No task found for given value "
+                      << *V << "\n");
+    return false;
+  }
+
+  if (AllocTask->isSerial())
+    // Alloc AI cannot be used in a subtask if its enclosing task is serial.
+    return false;
+
+  SmallVector<const Use *, 20> Worklist;
+  SmallSet<const Use *, 20> Visited;
+
+  // Add all uses of AI to the worklist.
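+  // As a source-level sketch (hypothetical), this is the situation the
+  // worklist search below looks for:
+  //
+  //   int x;               // allocated in the parent task
+  //   cilk_spawn foo(&x);  // &x flows into a detached subtask
+  //
+  // Pointer-forwarding instructions (bitcast, GEP, phi, select,
+  // addrspacecast) are traversed, and any use that lands in a task other than
+  // the allocating task counts as a detached use.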
+ for (const Use &U : V->uses()) { + Visited.insert(&U); + Worklist.push_back(&U); + } + + // Evaluate each use of AI. + while (!Worklist.empty()) { + const Use *U = Worklist.pop_back_val(); + + // Check if this use of AI is in a different task from the allocation. + Instruction *I = cast(U->getUser()); + LLVM_DEBUG(dbgs() << "\tExamining use: " << *I << "\n"); + if (AllocTask != TI.getTaskFor(I->getParent())) { + assert(TI.getTaskFor(I->getParent()) != AllocTask->getParentTask() && + "Use of alloca appears in a parent task of that alloca"); + // Because the use of AI cannot appear in a parent task of AI, it must be + // in a subtask. In particular, the use cannot be in a shared-EH spindle. + return true; + } + + // If the pointer to AI is transformed using one of the following + // operations, add uses of the transformed pointer to the worklist. + switch (I->getOpcode()) { + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::PHI: + case Instruction::Select: + case Instruction::AddrSpaceCast: + for (Use &UU : I->uses()) + if (Visited.insert(&UU).second) + Worklist.push_back(&UU); + break; + default: + break; + } + } + return false; +} + +/// Returns true if accesses on Addr could race due to pointer capture. +bool CilkSanitizerImpl::PossibleRaceByCapture(const Value *Addr, + const TaskInfo &TI, + LoopInfo *LI) const { + if (isa(Addr)) + // For this analysis, we consider all global values to be captured. + return true; + + // Check for detached uses of the underlying base objects. + SmallVectorImpl &BaseObjs = lookupBaseObjects(Addr, LI); + + // If we could not determine the base objects, conservatively return true. + if (BaseObjs.empty()) + return true; + + for (const Value *BaseObj : BaseObjs) { + // Skip any null objects + if (const Constant *C = dyn_cast(BaseObj)) { + // if (C->isNullValue()) + // continue; + // Is this value a constant that cannot be derived from any pointer + // value (we need to exclude constant expressions, for example, that + // are formed from arithmetic on global symbols). + bool IsNonPtrConst = isa(C) || isa(C) || + isa(C) || + isa(C) || isa(C); + if (IsNonPtrConst) + continue; + } + + // If the base object is not an instruction, conservatively return true. + if (!isa(BaseObj)) { + // From BasicAliasAnalysis.cpp: If this is an argument that corresponds to + // a byval or noalias argument, then it has not escaped before entering + // the function. + if (const Argument *A = dyn_cast(BaseObj)) { + if (!A->hasByValAttr() && !A->hasNoAliasAttr()) + return true; + } else + return true; + } + + // If the base object might have a detached use, return true. + if (lookupMightHaveDetachedUse(BaseObj, TI)) + return true; + } + + // Perform normal pointer-capture analysis. + // if (PointerMayBeCaptured(Addr, false, false)) + if (lookupPointerMayBeCaptured(Addr)) + return true; + + return false; +} + +bool CilkSanitizerImpl::unknownObjectUses(const Value *Addr, LoopInfo *LI, + const TargetLibraryInfo *TLI) const { + // Perform normal pointer-capture analysis. + if (lookupPointerMayBeCaptured(Addr)) + return true; + + // Check for detached uses of the underlying base objects. + SmallVectorImpl &BaseObjs = lookupBaseObjects(Addr, LI); + + // If we could not determine the base objects, conservatively return true. + if (BaseObjs.empty()) + return true; + + // If the base object is not an allocation function, return true. 
+  for (const Value *BaseObj : BaseObjs)
+    if (!isAllocFn(BaseObj, TLI))
+      return true;
+
+  return false;
+}
+
+void CilkSanitizerImpl::chooseInstructionsToInstrument(
+    SmallVectorImpl<Instruction *> &Local, SmallVectorImpl<Instruction *> &All,
+    const TaskInfo &TI, LoopInfo &LI, const TargetLibraryInfo *TLI) {
+  SmallSet<Value *, 8> WriteTargets;
+  // Iterate from the end.
+  for (Instruction *I : reverse(Local)) {
+    if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
+      Value *Addr = Store->getPointerOperand();
+      if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
+        continue;
+      WriteTargets.insert(Addr);
+    } else {
+      LoadInst *Load = cast<LoadInst>(I);
+      Value *Addr = Load->getPointerOperand();
+      if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
+        continue;
+      if (WriteTargets.count(Addr)) {
+        // We will write to this temp, so no reason to analyze the read.
+        NumOmittedReadsBeforeWrite++;
+        continue;
+      }
+      if (addrPointsToConstantData(Addr)) {
+        // Addr points to some constant data -- it cannot race with any writes.
+        NumOmittedReadsFromConstants++;
+        continue;
+      }
+    }
+    Value *Addr = isa<StoreInst>(*I)
+                      ? cast<StoreInst>(I)->getPointerOperand()
+                      : cast<LoadInst>(I)->getPointerOperand();
+    if (LocalBaseObj(Addr, &LI, TLI) &&
+        !PossibleRaceByCapture(Addr, TI, &LI)) {
+      // The variable is addressable but not captured, so it cannot be
+      // referenced from a different thread and participate in a data race
+      // (see llvm/Analysis/CaptureTracking.h for details).
+      NumOmittedNonCaptured++;
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << "Pushing " << *I << "\n");
+    All.push_back(I);
+  }
+  Local.clear();
+}
+
+bool CilkSanitizerImpl::isLibCall(const Instruction &I,
+                                  const TargetLibraryInfo *TLI) {
+  if (!isa<CallBase>(I))
+    return false;
+
+  if (!TLI)
+    return false;
+
+  if (const Function *Called = dyn_cast<CallBase>(&I)->getCalledFunction()) {
+    LibFunc F;
+    bool FoundLibFunc = TLI->getLibFunc(*Called, F);
+    if (FoundLibFunc)
+      return true;
+  }
+
+  return false;
+}
+
+// Helper function to determine if the call-base instruction \p I should be
+// skipped when examining calls that affect race detection.  Returns true if
+// and only if \p I is a simple call that cannot race.
+bool CilkSanitizerImpl::simpleCallCannotRace(const Instruction &I) {
+  return callsPlaceholderFunction(I);
+}
+
+// Helper function to determine if the call-base instruction \p I should be
+// skipped when examining calls that affect race detection.  Returns true if
+// and only if \p I is identified as a special function that should be ignored.
+bool CilkSanitizerImpl::shouldIgnoreCall(const Instruction &I) {
+  if (const CallBase *Call = dyn_cast<CallBase>(&I))
+    if (const Function *Called = Call->getCalledFunction())
+      if (Called->hasName() && (Called->getName().startswith("__csi") ||
+                                Called->getName().startswith("__csan") ||
+                                Called->getName().startswith("__cilksan")))
+        return true;
+  return false;
+}
+
+// Helper function to get the ID of a function being called.  These IDs are
+// stored in separate global variables in the program.  This method will create
+// a new global variable for the Callee's ID if necessary.
+Value *CilkSanitizerImpl::GetCalleeFuncID(const Function *Callee,
+                                          IRBuilder<> &IRB) {
+  if (!Callee)
+    // Unknown targets (i.e., indirect calls) are always unknown.
+ return IRB.getInt64(CsiCallsiteUnknownTargetId); + + std::string GVName = + CsiFuncIdVariablePrefix + Callee->getName().str(); + GlobalVariable *FuncIdGV = M.getNamedGlobal(GVName); + Type *FuncIdGVTy = IRB.getInt64Ty(); + if (!FuncIdGV) { + FuncIdGV = + dyn_cast(M.getOrInsertGlobal(GVName, FuncIdGVTy)); + assert(FuncIdGV); + FuncIdGV->setConstant(false); + if (Options.jitMode && !Callee->empty()) + FuncIdGV->setLinkage(Callee->getLinkage()); + else + FuncIdGV->setLinkage(GlobalValue::WeakAnyLinkage); + FuncIdGV->setInitializer(IRB.getInt64(CsiCallsiteUnknownTargetId)); + } + return IRB.CreateLoad(FuncIdGVTy, FuncIdGV); +} + +//------------------------------------------------------------------------------ +// SimpleInstrumentor methods, which do not do static race detection. +//------------------------------------------------------------------------------ + +bool CilkSanitizerImpl::SimpleInstrumentor::InstrumentSimpleInstructions( + SmallVectorImpl &Instructions) { + bool Result = false; + for (Instruction *I : Instructions) { + bool LocalResult = false; + if (isa(I) || isa(I)) + LocalResult |= CilkSanImpl.instrumentLoadOrStore(I); + else if (isa(I) || isa(I)) + LocalResult |= CilkSanImpl.instrumentAtomic(I); + else + dbgs() << "[Cilksan] Unknown simple instruction: " << *I << "\n"; + + if (LocalResult) { + Result |= LocalResult; + // Record the detaches for the task containing this instruction. These + // detaches need to be instrumented. + getDetachesForInstruction(I); + } + } + return Result; +} + +bool CilkSanitizerImpl::SimpleInstrumentor::InstrumentAnyMemIntrinsics( + SmallVectorImpl &MemIntrinsics) { + bool Result = false; + for (Instruction *I : MemIntrinsics) { + bool LocalResult = false; + if (isa(I)) { + LocalResult |= CilkSanImpl.instrumentAnyMemIntrinAcc(I, /*Src*/ 1); + LocalResult |= CilkSanImpl.instrumentAnyMemIntrinAcc(I, /*Dst*/ 0); + } else { + assert(isa(I) && + "InstrumentAnyMemIntrinsics operating on not a memory intrinsic."); + LocalResult |= CilkSanImpl.instrumentAnyMemIntrinAcc(I, unsigned(-1)); + } + if (LocalResult) { + Result |= LocalResult; + // Record the detaches for the task containing this instruction. These + // detaches need to be instrumented. + getDetachesForInstruction(I); + } + } + return Result; +} + +bool CilkSanitizerImpl::SimpleInstrumentor::InstrumentCalls( + SmallVectorImpl &Calls) { + bool Result = false; + for (Instruction *I : Calls) { + // Allocation-function and free calls are handled separately. + if (isAllocFn(I, TLI) || isFreeFn(I, TLI)) + continue; + + bool LocalResult = false; + if (isa(I)) + LocalResult |= + CilkSanImpl.instrumentIntrinsicCall(I, /*MAAPVals*/ nullptr); + else if (isLibCall(*I, TLI)) + LocalResult |= + CilkSanImpl.instrumentLibCall(I, /*MAAPVals*/ nullptr); + else + LocalResult |= CilkSanImpl.instrumentCallsite(I, /*MAAPVals*/ nullptr); + if (LocalResult) { + Result |= LocalResult; + // Record the detaches for the task containing this instruction. These + // detaches need to be instrumented. + getDetachesForInstruction(I); + } + } + return Result; +} + +bool CilkSanitizerImpl::SimpleInstrumentor::InstrumentAncillaryInstructions( + SmallPtrSetImpl &Allocas, + SmallPtrSetImpl &AllocationFnCalls, + SmallPtrSetImpl &FreeCalls, + DenseMap &SyncRegNums, + DenseMap &SRCounters, const DataLayout &DL) { + bool Result = false; + SmallPtrSet Syncs; + SmallPtrSet Loops; + + // Instrument allocas and allocation-function calls that may be involved in a + // race. 
+ for (Instruction *I : Allocas) { + // The simple instrumentor just instruments everything + CilkSanImpl.instrumentAlloca(I, TI); + getDetachesForInstruction(I); + Result = true; + } + for (Instruction *I : AllocationFnCalls) { + // The simple instrumentor just instruments everything + CilkSanImpl.instrumentAllocationFn(I, DT, TLI); + getDetachesForInstruction(I); + Result = true; + } + for (Instruction *I : FreeCalls) { + // The first argument of the free call is the pointer. + Value *Ptr = I->getOperand(0); + // If the pointer corresponds to an allocation function call in this + // function, then instrument it. + if (Instruction *PtrI = dyn_cast(Ptr)) { + if (AllocationFnCalls.count(PtrI)) { + CilkSanImpl.instrumentFree(I, TLI); + getDetachesForInstruction(I); + Result = true; + continue; + } + } + // The simple instrumentor just instruments everything + CilkSanImpl.instrumentFree(I, TLI); + getDetachesForInstruction(I); + Result = true; + } + + // Instrument detaches + for (DetachInst *DI : Detaches) { + CilkSanImpl.instrumentDetach(DI, SyncRegNums[DI->getSyncRegion()], + SRCounters[DI->getDetached()], DT, TI, LI); + Result = true; + // Get syncs associated with this detach + for (SyncInst *SI : CilkSanImpl.DetachToSync[DI]) + Syncs.insert(SI); + + if (CilkSanImpl.Options.InstrumentLoops) { + // Get any loop associated with this detach. + Loop *L = LI.getLoopFor(DI->getParent()); + if (spawnsTapirLoopBody(DI, LI, TI)) + Loops.insert(L); + } + } + + // Instrument associated syncs + for (SyncInst *SI : Syncs) + CilkSanImpl.instrumentSync(SI, SyncRegNums[SI->getSyncRegion()]); + + if (CilkSanImpl.Options.InstrumentLoops) { + // Recursively instrument all Tapir loops + for (Loop *L : Loops) + CilkSanImpl.instrumentTapirLoop(*L, TI, SyncRegNums); + } + + return Result; +} + +// TODO: Combine this redundant logic with that in Instrumentor +void CilkSanitizerImpl::SimpleInstrumentor::getDetachesForInstruction( + Instruction *I) { + // Get the Task for I. + Task *T = TI.getTaskFor(I->getParent()); + // Add the ancestors of T to the set of detaches to instrument. + while (!T->isRootTask()) { + // Once we encounter a detach we've previously added to the set, we know + // that all its parents are also in the set. + if (!Detaches.insert(T->getDetach()).second) + return; + T = T->getParentTask(); + } +} + +//------------------------------------------------------------------------------ +// Instrumentor methods +//------------------------------------------------------------------------------ + +void CilkSanitizerImpl::Instrumentor::getDetachesForInstruction( + Instruction *I) { + // Get the Task for I. + Task *T = TI.getTaskFor(I->getParent()); + // Add the ancestors of T to the set of detaches to instrument. + while (!T->isRootTask()) { + // Once we encounter a detach we've previously added to the set, we know + // that all its parents are also in the set. 
+ if (!Detaches.insert(T->getDetach()).second) + return; + T = T->getParentTask(); + } +} + +unsigned CilkSanitizerImpl::Instrumentor::RaceTypeToFlagVal( + RaceInfo::RaceType RT) { + unsigned FlagVal = static_cast(MAAPValue::NoAccess); + if (RaceInfo::isLocalRace(RT) || RaceInfo::isOpaqueRace(RT)) + FlagVal = static_cast(MAAPValue::ModRef); + if (RaceInfo::isRaceViaAncestorMod(RT)) + FlagVal |= static_cast(MAAPValue::Mod); + if (RaceInfo::isRaceViaAncestorRef(RT)) + FlagVal |= static_cast(MAAPValue::Ref); + return FlagVal; +} + +static Value *getMAAPIRValue(IRBuilder<> &IRB, unsigned MV) { + return IRB.getInt8(MV); +} + +// Insert per-argument MAAPs for this function +void CilkSanitizerImpl::Instrumentor::InsertArgMAAPs(Function &F, + Value *FuncId) { + if (!MAAPChecks) + return; + LLVM_DEBUG(dbgs() << "InsertArgMAAPs: " << F.getName() << "\n"); + IRBuilder<> IRB(cast(FuncId)->getNextNode()); + unsigned ArgIdx = 0; + for (Argument &Arg : F.args()) { + if (!Arg.getType()->isPtrOrPtrVectorTy()) + continue; + + // Create a new flag for this argument MAAP. + Type *MAAPIRValueTy = getMAAPIRValue(IRB, 0)->getType(); + Value *NewFlag = IRB.CreateAlloca(MAAPIRValueTy, + Arg.getType()->getPointerAddressSpace()); + Value *FinalMV; + // If this function is main, then it has no ancestors that can create races. + if (F.getName() == "main") { + FinalMV = getMAAPIRValue(IRB, RaceTypeToFlagVal(RaceInfo::None)); + IRB.CreateStore(FinalMV, NewFlag); + } else { + // Call the runtime function to set the value of this flag. + IRB.CreateCall(CilkSanImpl.GetMAAP, + {NewFlag, FuncId, IRB.getInt8(ArgIdx)}); + + // Incorporate local information into this MAAP value. + unsigned LocalMV = static_cast(MAAPValue::NoAccess); + if (Arg.hasNoAliasAttr()) + LocalMV |= static_cast(MAAPValue::NoAlias); + + // Store this local MAAP value. + FinalMV = IRB.CreateOr(getMAAPIRValue(IRB, LocalMV), + IRB.CreateLoad(MAAPIRValueTy, NewFlag)); + IRB.CreateStore(FinalMV, NewFlag); + } + // Associate this flag with the argument for future lookups. + LLVM_DEBUG(dbgs() << "Recording local MAAP for arg " << Arg << ": " + << *NewFlag << "\n"); + LocalMAAPs[&Arg] = FinalMV; + ArgMAAPs.insert(FinalMV); + ++ArgIdx; + } + + // Record other objects known to be involved in races. + for (auto &ObjRD : RI.getObjectMRForRace()) { + if (isa(ObjRD.first)) { + unsigned MAAPVal = static_cast(MAAPValue::NoAccess); + if (isModSet(ObjRD.second)) + MAAPVal |= static_cast(MAAPValue::Mod); + if (isRefSet(ObjRD.second)) + MAAPVal |= static_cast(MAAPValue::Ref); + // Determine if this object is no-alias. + if (const CallBase *CB = dyn_cast(ObjRD.first)) { + if (CB->hasRetAttr(Attribute::NoAlias)) + MAAPVal |= static_cast(MAAPValue::NoAlias); + } else if (isa(ObjRD.first)) + MAAPVal |= static_cast(MAAPValue::NoAlias); + + LLVM_DEBUG(dbgs() << "Setting LocalMAAPs for " << *ObjRD.first << " = " + << MAAPVal << "\n"); + LocalMAAPs[ObjRD.first] = getMAAPIRValue(IRB, MAAPVal); + } + } +} + +bool CilkSanitizerImpl::Instrumentor::InstrumentSimpleInstructions( + SmallVectorImpl &Instructions) { + bool Result = false; + for (Instruction *I : Instructions) { + bool LocalResult = false; + // Simple instructions, such as loads, stores, or atomics, have just one + // pointer operand, and therefore should have at most one entry of RaceData. + + // If the instruction might participate in a local or opaque race, + // instrument it unconditionally. 
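+    // ("Opaquely" means the analysis cannot tell what the access might
+    // conflict with, so its hook is always emitted.)  The remaining racy
+    // accesses are deferred to DelayedSimpleInsts and handled later by
+    // PerformDelayedInstrumentation(), typically behind a MAAP-derived guard,
+    // roughly:
+    //
+    //   if (<MAAP check for this access>)
+    //     __csan_load(...);   // pseudo-code sketch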
+ if (RI.mightRaceOpaquely(I)) { + if (isa(I) || isa(I)) + LocalResult |= CilkSanImpl.instrumentLoadOrStore(I); + else if (isa(I) || isa(I)) + LocalResult |= CilkSanImpl.instrumentAtomic(I); + else + dbgs() << "[Cilksan] Unknown simple instruction: " << *I << "\n"; + } else if (RI.mightRaceViaAncestor(I) || RI.mightRaceLocally(I)) { + // Otherwise, if the instruction might participate in a race via an + // ancestor function instantiation, instrument it conditionally, based on + // the pointer. + // + // Delay handling this instruction. + DelayedSimpleInsts.push_back(I); + LocalResult |= true; + } + + // If any instrumentation was inserted, collect associated instructions to + // instrument. + if (LocalResult) { + Result |= LocalResult; + // Record the detaches for the task containing this instruction. These + // detaches need to be instrumented. + getDetachesForInstruction(I); + } + } + return Result; +} + +bool CilkSanitizerImpl::Instrumentor::InstrumentAnyMemIntrinsics( + SmallVectorImpl &MemIntrinsics) { + bool Result = false; + for (Instruction *I : MemIntrinsics) { + bool LocalResult = false; + // If this instruction cannot race, skip it. + if (!RI.mightRace(I)) + continue; + + // Look over the race data to determine what memory intrinsics need to be + // instrumented and how. + SmallSet, 2> ToInstrument; + SmallSet, 2> MaybeDelay; + for (const RaceInfo::RaceData &RD : RI.getRaceData(I)) { + assert(RD.getPtr() && "No pointer for race with memory intrinsic."); + if (RaceInfo::isOpaqueRace(RD.Type)) { + ToInstrument.insert(std::make_pair(I, RD.OperandNum)); + LocalResult |= true; + } else if (RaceInfo::isRaceViaAncestor(RD.Type) || + RaceInfo::isLocalRace(RD.Type)) { + // Possibly delay handling this instruction. + MaybeDelay.insert(std::make_pair(I, RD.OperandNum)); + LocalResult |= true; + } + } + + // Do the instrumentation + for (const std::pair &MemIntrin : ToInstrument) + CilkSanImpl.instrumentAnyMemIntrinAcc(MemIntrin.first, MemIntrin.second); + for (const std::pair &MemIntrin : MaybeDelay) + if (!ToInstrument.count(MemIntrin)) + DelayedMemIntrinsics.push_back(MemIntrin); + + // If any instrumentation was inserted, collect associated instructions to + // instrument. + if (LocalResult) { + Result |= LocalResult; + // Record the detaches for the task containing this instruction. These + // detaches need to be instrumented. + getDetachesForInstruction(I); + } + } + return Result; +} + +bool CilkSanitizerImpl::Instrumentor::InstrumentCalls( + SmallVectorImpl &Calls) { + bool Result = false; + for (Instruction *I : Calls) { + // Allocation-function and free calls are handled separately. + if (isAllocFn(I, TLI) || isFreeFn(I, TLI)) + continue; + + bool LocalResult = false; + bool GetDetaches = false; + + // Get current race data for this call. + RaceInfo::RaceType CallRT = RI.getRaceType(I); + LLVM_DEBUG({ + dbgs() << "Call " << *I << ": "; + RaceInfo::printRaceType(CallRT, dbgs()); + dbgs() << "\n"; + }); + + // Get update race data, if it's available. + RaceInfo::RaceType FuncRT = CallRT; + CallBase *CB = dyn_cast(I); + if (Function *CF = CB->getCalledFunction()) + if (CilkSanImpl.FunctionRaceType.count(CF)) + FuncRT = CilkSanImpl.FunctionRaceType[CF]; + + LLVM_DEBUG({ + dbgs() << " FuncRT: "; + RaceInfo::printRaceType(FuncRT, dbgs()); + dbgs() << "\n"; + }); + + // Propagate information about opaque races from function to call. 
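+    // That is, if the callee's own summary (FuncRT) shows it cannot race
+    // opaquely, any opaque-race bit on this call site merely reflects treating
+    // the call as a black box, so it is cleared here.  Illustrative example:
+    // if CallRT was only {Opaque} and FuncRT shows no race at all, CallRT
+    // becomes empty and the callsite's checks can be suppressed below.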
+ if (!RaceInfo::isOpaqueRace(FuncRT)) + CallRT = RaceInfo::clearOpaqueRace(CallRT); + + LLVM_DEBUG({ + dbgs() << " New CallRT: "; + RaceInfo::printRaceType(CallRT, dbgs()); + dbgs() << "\n"; + }); + + // If this instruction cannot race, see if we can suppress it + if (!RaceInfo::isRace(CallRT)) { + // Nothing to suppress if this is an intrinsic + if (isa(I)) + continue; + + // We can only suppress calls whose functions don't have local races. + if (!RaceInfo::isLocalRace(FuncRT)) { + if (!CB->doesNotAccessMemory()) + LocalResult |= CilkSanImpl.suppressCallsite(I); + continue; + // } else { + // GetDetaches |= CilkSanImpl.instrumentCallsite(I); + // // SmallPtrSet Objects; + // // RI.getObjectsFor(I, Objects); + // // for (Value *Obj : Objects) { + // // CilkSanImpl.ObjectMRForRace[Obj] = ModRefInfo::ModRef; + // // } + } + // continue; + } + + // We're going to instrument this call for potential races. First get + // MAAP information for its arguments, if any races depend on the + // ancestor. + SmallVector MAAPVals; + LLVM_DEBUG(dbgs() << "Getting MAAP values for " << *CB << "\n"); + IRBuilder<> IRB(I); + unsigned OpIdx = 0; + for (const Value *Op : CB->args()) { + if (!MAAPChecks) + continue; + + if (!Op->getType()->isPtrOrPtrVectorTy()) { + ++OpIdx; + continue; + } + + // Check if this operand might race via ancestor. + bool RaceViaAncestor = false; + for (const RaceInfo::RaceData &RD : RI.getRaceData(I)) { + if (RD.OperandNum != OpIdx) + continue; + if (RaceInfo::isRaceViaAncestor(RD.Type)) { + RaceViaAncestor = true; + break; + } + } + + Value *MAAPVal; + if (RaceViaAncestor) + // Evaluate race data for I and OpIdx to compute the MAAP value. + MAAPVal = getMAAPValue(I, IRB, OpIdx); + else + // We have either an opaque race or a local race, but _not_ a race via + // an ancestor. We want to propagate MAAP information on pointer + // arguments, but we don't need to be pessimistic when a value can't be + // found. + MAAPVal = getMAAPValue(I, IRB, OpIdx, MAAPValue::NoAccess, + /*CheckArgs*/ false); + LLVM_DEBUG({ + dbgs() << " Op: " << *CB->getArgOperand(OpIdx) << "\n"; + dbgs() << " MAAP value: " << *MAAPVal << "\n"; + }); + MAAPVals.push_back(MAAPVal); + ++OpIdx; + } + + Value *CalleeID = CilkSanImpl.GetCalleeFuncID(CB->getCalledFunction(), IRB); + // We set the MAAPs in reverse order to support stack-like access of the + // MAAPs by in-order calls to GetMAAP in the callee. + for (Value *MAAPVal : reverse(MAAPVals)) + IRB.CreateCall(CilkSanImpl.SetMAAP, {MAAPVal, CalleeID}); + + if (isa(I)) + GetDetaches |= CilkSanImpl.instrumentIntrinsicCall(I, &MAAPVals); + else if (isLibCall(*I, TLI)) + GetDetaches |= CilkSanImpl.instrumentLibCall(I, &MAAPVals); + else + GetDetaches |= CilkSanImpl.instrumentCallsite(I, &MAAPVals); + + // If any instrumentation was inserted, collect associated instructions to + // instrument. + Result |= LocalResult; + if (GetDetaches) { + Result |= GetDetaches; + // Record the detaches for the task containing this instruction. These + // detaches need to be instrumented. + getDetachesForInstruction(I); + } + } + return Result; +} + +Value *CilkSanitizerImpl::Instrumentor::readMAAPVal(Value *V, + IRBuilder<> &IRB) { + if (!ArgMAAPs.count(V)) + return V; + // Marking the load as invariant is not technically correct, because the + // __csan_get_MAAP call sets the value. But this call happens + // once, and all subsequent loads will return the same value. 
+ // + // MDNode *MD = llvm::MDNode::get(IRB.getContext(), llvm::None); + // cast(Load)->setMetadata(LLVMContext::MD_invariant_load, MD); + + // TODO: See if there's a better way to annotate this load for optimization. + // LoadInst *I = IRB.CreateLoad(V); + // if (auto *IMD = I->getMetadata(LLVMContext::MD_invariant_group)) + // I->setMetadata(LLVMContext::MD_invariant_group, IMD); + // else + // I->setMetadata(LLVMContext::MD_invariant_group, + // MDNode::get(IRB.getContext(), {})); + Value *MV; + if (AllocaInst *A = dyn_cast(V)) + MV = IRB.CreateLoad(A->getAllocatedType(), A); + else + MV = V; + return MV; +} + +// Get the memory location for this instruction and operand. +static MemoryLocation getMemoryLocation(Instruction *I, unsigned OperandNum, + const TargetLibraryInfo *TLI) { + if (auto *MI = dyn_cast(I)) { + if (auto *MT = dyn_cast(I)) { + if (OperandNum == 1) + return MemoryLocation::getForSource(MT); + } + return MemoryLocation::getForDest(MI); + } else if (OperandNum == static_cast(-1)) { + return MemoryLocation::get(I); + } else { + assert(isa(I) && + "Unknown instruction and operand ID for getting MemoryLocation."); + CallBase *CB = cast(I); + return MemoryLocation::getForArgument(CB, OperandNum, TLI); + } +} + +// Evaluate the noalias value in the MAAP for Obj, and intersect that result +// with the noalias information for other objects. +Value *CilkSanitizerImpl::Instrumentor::getNoAliasMAAPValue( + Instruction *I, IRBuilder<> &IRB, unsigned OperandNum, + MemoryLocation Loc, const RaceInfo::RaceData &RD, const Value *Obj, + Value *ObjNoAliasFlag) { + AAResults *AA = RI.getAA(); + + for (const RaceInfo::RaceData &OtherRD : RI.getRaceData(I)) { + // Skip checking other accesses that don't involve a pointer + if (!OtherRD.Access.getPointer()) + continue; + // Skip this operand when scanning for aliases + if (OperandNum == OtherRD.OperandNum) + continue; + + // If we can tell statically that these two memory locations don't alias, + // move on. + if (!AA->alias(Loc, getMemoryLocation(I, OtherRD.OperandNum, TLI))) + continue; + + // We trust that the MAAP value in LocalMAAPs[] for this object Obj, set by + // InsertArgMAAPs, is correct. We need to check the underlying objects of + // the other arguments to see if they match this object. + + // Otherwise we check the underlying objects. + SmallPtrSet OtherObjects; + RI.getObjectsFor(OtherRD.Access, OtherObjects); + for (const Value *OtherObj : OtherObjects) { + // If we find another instance of this object in another argument, + // then we don't have "no alias". + if (Obj == OtherObj) { + LLVM_DEBUG({ + dbgs() << "getNoAliasMAAPValue: Matching objects found:\n"; + dbgs() << " Obj: " << *Obj << "\n"; + dbgs() << " I: " << *I << "\n"; + dbgs() << " Operands " << OperandNum << ", " << OtherRD.OperandNum + << "\n"; + }); + return getMAAPIRValue(IRB, 0); + } + + // We now know that Obj and OtherObj don't match. + + // If the other object is an argument, then we trust the noalias value in + // the MAAP for Obj. + if (isa(OtherObj)) + continue; + + // // If the other object is something we can't reason about locally, then we + // // give up. + // if (!isa(OtherObj)) + // return getMAAPIRValue(IRB, 0); + + // Otherwise, check if the other object might alias this one. 
+ if (AA->alias(Loc, MemoryLocation::getBeforeOrAfter(OtherObj))) { + LLVM_DEBUG({ + dbgs() << "getNoAliasMAAPValue: Possible aliasing between:\n"; + dbgs() << " Obj: " << *Obj << "\n"; + dbgs() << " OtherObj: " << *OtherObj << "\n"; + }); + return getMAAPIRValue(IRB, 0); + } + } + } + return ObjNoAliasFlag; +} + +Value *CilkSanitizerImpl::Instrumentor::getMAAPValue(Instruction *I, + IRBuilder<> &IRB, + unsigned OperandNum, + MAAPValue DefaultMV, + bool CheckArgs) { + Function *F = I->getFunction(); + AAResults *AA = RI.getAA(); + MemoryLocation Loc = getMemoryLocation(I, OperandNum, TLI); + Value *MV = getMAAPIRValue(IRB, static_cast(MAAPValue::NoAccess)); + Value *DefaultMAAP = getMAAPIRValue(IRB, static_cast(DefaultMV)); + Value *NoAliasFlag = + getMAAPIRValue(IRB, static_cast(MAAPValue::NoAlias)); + + // If I is a call, check if any other arguments of this call alias the + // specified operand. + if (const CallBase *CB = dyn_cast(I)) { + unsigned OpIdx = 0; + bool FoundAliasingArg = false; + for (const Value *Arg : CB->args()) { + // Skip this operand and any operands that are not pointers. + if (OpIdx == OperandNum || !Arg->getType()->isPtrOrPtrVectorTy()) { + ++OpIdx; + continue; + } + + // If this argument does not alias Loc, skip it. + if (!AA->alias(Loc, getMemoryLocation(I, OpIdx, TLI))) { + ++OpIdx; + continue; + } + + // If the operands must alias, then discard the default noalias MAAP + // value. + AliasResult ArgAlias = AA->alias(Loc, getMemoryLocation(I, OpIdx, TLI)); + if (AliasResult::MustAlias == ArgAlias || + AliasResult::PartialAlias == ArgAlias) { + NoAliasFlag = getMAAPIRValue(IRB, 0); + break; + } + + // Get objects corresponding to this argument. + SmallPtrSet ArgObjects; + RI.getObjectsFor(RaceInfo::MemAccessInfo( + Arg, isModSet(AA->getArgModRefInfo(CB, OpIdx))), + ArgObjects); + for (const Value *Obj : ArgObjects) { + // If Loc and the racer object cannot alias, then there's nothing to + // check. + if (!AA->alias(Loc, MemoryLocation::getBeforeOrAfter(Obj))) + continue; + + // If we have no local MAAP data for Obj, then act pessimally. + if (!LocalMAAPs.count(Obj)) { + FoundAliasingArg = true; + break; + } + + // Intersect the dynamic noalias information for this object into the + // noalias flag. + Value *FlagLoad = readMAAPVal(LocalMAAPs[Obj], IRB); + Value *ObjNoAliasFlag = IRB.CreateAnd( + FlagLoad, + getMAAPIRValue(IRB, static_cast(MAAPValue::NoAlias))); + NoAliasFlag = IRB.CreateAnd(NoAliasFlag, ObjNoAliasFlag); + } + + if (FoundAliasingArg) { + // If we found an aliasing argument, fall back to noalias = false. + NoAliasFlag = getMAAPIRValue(IRB, 0); + break; + } + ++OpIdx; + } + } + + // Check the recorded race data for I. + for (const RaceInfo::RaceData &RD : RI.getRaceData(I)) { + // Skip race data for different operands of the same instruction. + if (OperandNum != RD.OperandNum) + continue; + + // Otherwise use information about the possibly accessed objects to + // determine the MAAP value. + SmallPtrSet Objects; + RI.getObjectsFor(RD.Access, Objects); + + // If we have a valid racer, get the objects that that racer might access. 
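+ // Example (illustrative): for a local race in which a spawned task stores
+ // through p while the continuation loads through q, the racer objects are
+ // the underlying objects of q. If the MAAP bits show that either side is
+ // dynamically noalias (e.g., derived from a noalias argument or a fresh
+ // allocation), the Mod/Ref contribution for that racer is dropped below.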
+ SmallPtrSet RacerObjects; + unsigned LocalRaceVal = static_cast(MAAPValue::NoAccess); + if (RD.Racer.isValid()) { + // Get the local race value for this racer + assert(RaceInfo::isLocalRace(RD.Type) && "Valid racer for nonlocal race"); + RI.getObjectsFor( + RaceInfo::MemAccessInfo(RD.Racer.getPtr(), RD.Racer.isMod()), + RacerObjects); + if (RD.Racer.isMod()) + LocalRaceVal |= static_cast(MAAPValue::Mod); + if (RD.Racer.isRef()) + LocalRaceVal |= static_cast(MAAPValue::Ref); + } + + // Get MAAPs from objects + for (const Value *Obj : Objects) { + // If we find an object with no MAAP, give up. + if (!LocalMAAPs.count(Obj)) { + LLVM_DEBUG(dbgs() << "No local MAAP found for obj " << *Obj << "\n"); + if (RD.Racer.isValid()) + MV = IRB.CreateOr(MV, getMAAPIRValue(IRB, LocalRaceVal)); + else + MV = IRB.CreateOr(MV, DefaultMAAP); + continue; + } + + Value *FlagLoad = readMAAPVal(LocalMAAPs[Obj], IRB); + Value *FlagCheck = IRB.CreateAnd( + FlagLoad, getMAAPIRValue(IRB, RaceTypeToFlagVal(RD.Type))); + MV = IRB.CreateOr(MV, FlagCheck); + + // Get the dynamic no-alias bit from the MAAP value. + Value *ObjNoAliasFlag = IRB.CreateAnd( + FlagLoad, + getMAAPIRValue(IRB, static_cast(MAAPValue::NoAlias))); + Value *NoAliasCheck = + IRB.CreateICmpNE(getMAAPIRValue(IRB, 0), ObjNoAliasFlag); + + if (RD.Racer.isValid()) { + for (const Value *RObj : RacerObjects) { + // If the racer object matches Obj, there's no need to check a flag. + if (RObj == Obj) { + MV = IRB.CreateOr(MV, LocalRaceVal); + continue; + } + + // If Loc and the racer object cannot alias, then there's nothing to + // check. + if (!AA->alias(Loc, MemoryLocation::getBeforeOrAfter(RObj))) + continue; + + // If there is must or partial aliasing between this object and racer + // object, or we have no local MAAP information for RObj, then + // act conservatively, because there's nothing to check. + if (AliasResult::MustAlias == + AA->alias(Loc, MemoryLocation::getBeforeOrAfter(RObj)) || + AliasResult::PartialAlias == + AA->alias(Loc, MemoryLocation::getBeforeOrAfter(RObj)) || + !LocalMAAPs.count(RObj)) { + if (!LocalMAAPs.count(RObj)) + LLVM_DEBUG(dbgs() << "No local MAAP found for racer object " + << *RObj << "\n"); + else + LLVM_DEBUG(dbgs() << "AA indicates must or partial alias with " + "racer object " + << *RObj << "\n"); + MV = IRB.CreateOr(MV, LocalRaceVal); + continue; + } + + // These two objects may alias, based on static analysis. Check the + // dynamic MAAP values. We can suppress the race if either this + // object or the racer object is dynamically noalias, i.e., if either + // was derived from an allocation or noalias function argument. + Value *FlagLoad = readMAAPVal(LocalMAAPs[RObj], IRB); + Value *RObjNoAliasFlag = IRB.CreateAnd( + FlagLoad, + getMAAPIRValue(IRB, static_cast(MAAPValue::NoAlias))); + Value *RObjNoAliasCheck = + IRB.CreateICmpNE(getMAAPIRValue(IRB, 0), RObjNoAliasFlag); + Value *FlagCheck = IRB.CreateSelect( + IRB.CreateOr(NoAliasCheck, RObjNoAliasCheck), + getMAAPIRValue(IRB, 0), + IRB.CreateAnd(FlagLoad, getMAAPIRValue(IRB, LocalRaceVal))); + MV = IRB.CreateOr(MV, FlagCheck); + } + } else if (CheckArgs) { + // Check the function arguments that might alias this object. + for (Argument &Arg : F->args()) { + // Ignore non-pointer arguments + if (!Arg.getType()->isPtrOrPtrVectorTy()) + continue; + // Ignore any arguments that match checked objects. + if (&Arg == Obj) + continue; + // Check if Loc and Arg may alias. 
+ if (!AA->alias(Loc, MemoryLocation::getBeforeOrAfter(&Arg))) + continue; + // If we have no local MAAP information about the argument, + // then there's nothing to check. + if (!LocalMAAPs.count(&Arg)) { + LLVM_DEBUG(dbgs() << "No local MAAP found for arg " << Arg << "\n"); + MV = IRB.CreateOr(MV, DefaultMAAP); + continue; + } + + // These two objects may alias, based on static analysis. Check the + // dynamic MAAP values. We can suppress the race if either + // this object or the racer object is dynamically noalias, i.e., if + // either was derived from an allocation or noalias function argument. + Value *FlagLoad = readMAAPVal(LocalMAAPs[&Arg], IRB); + Value *ArgNoAliasFlag = IRB.CreateAnd( + FlagLoad, + getMAAPIRValue(IRB, static_cast(MAAPValue::NoAlias))); + Value *ArgNoAliasCheck = + IRB.CreateICmpNE(getMAAPIRValue(IRB, 0), ArgNoAliasFlag); + Value *FlagCheck = IRB.CreateSelect( + IRB.CreateOr(NoAliasCheck, ArgNoAliasCheck), + getMAAPIRValue(IRB, 0), + IRB.CreateAnd(FlagLoad, + getMAAPIRValue(IRB, RaceTypeToFlagVal(RD.Type)))); + MV = IRB.CreateOr(MV, FlagCheck); + } + } + // Call getNoAliasMAAPValue to evaluate the no-alias value in the + // MAAP for Obj, and intersect that result with the noalias + // information for other objects. + NoAliasFlag = IRB.CreateAnd(NoAliasFlag, + getNoAliasMAAPValue(I, IRB, OperandNum, Loc, + RD, Obj, ObjNoAliasFlag)); + } + } + // Record the no-alias information. + MV = IRB.CreateOr(MV, NoAliasFlag); + return MV; +} + +Value *CilkSanitizerImpl::Instrumentor::getMAAPCheck(Instruction *I, + IRBuilder<> &IRB, + unsigned OperandNum) { + Function *F = I->getFunction(); + bool LocalRace = RI.mightRaceLocally(I); + AAResults *AA = RI.getAA(); + MemoryLocation Loc = getMemoryLocation(I, OperandNum, TLI); + Value *MAAPChk = IRB.getTrue(); + if (LocalRace) + return IRB.getFalse(); + + // Check the recorded race data for I. + for (const RaceInfo::RaceData &RD : RI.getRaceData(I)) { + LLVM_DEBUG(dbgs() << " Race Data:\n Ptr = " << *RD.getPtr() << "\n"); + LLVM_DEBUG(RaceInfo::printRaceType(RD.Type, dbgs() << " ")); + LLVM_DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << " current MAAPChk = " << *MAAPChk << "\n"); + // Skip race data for different operands of the same instruction. + if (OperandNum != RD.OperandNum) + continue; + + // If this racer is opaque, then we can't create a valid MAAP check for it. + if (RaceInfo::isOpaqueRace(RD.Type)) + return IRB.getFalse(); + + // If this racer is local, then skip it. We've already accommodated local + // races via runtime pointer checks, if available. + if (RaceInfo::isLocalRace(RD.Type)) + continue; + + LLVM_DEBUG(dbgs() << " Getting objects for racer\n"); + + SmallPtrSet Objects; + RI.getObjectsFor(RD.Access, Objects); + + // If we have a valid racer, get the objects that that racer might access. + SmallPtrSet RacerObjects; + + for (const Value *Obj : Objects) { + LLVM_DEBUG(dbgs() << " Object " << *Obj << "\n"); + LLVM_DEBUG(dbgs() << " current MAAPChk = " << *MAAPChk << "\n"); + // Ignore objects that are not involved in races. + if (!RI.ObjectInvolvedInRace(Obj)) + continue; + + // If we find an object with no MAAP, give up. 
+ if (!LocalMAAPs.count(Obj)) { + LLVM_DEBUG(dbgs() << "No local MAAP found for object " << *Obj << "\n" + << " I: " << *I << "\n" + << " Ptr: " << *RD.Access.getPointer() << "\n"); + return IRB.getFalse(); + } + + Value *FlagLoad = readMAAPVal(LocalMAAPs[Obj], IRB); + LLVM_DEBUG(dbgs() << " FlagLoad " << *FlagLoad << "\n"); + + // If we're dealing with a local race, then don't suppress based on the + // race-type information from the MAAP value. For function arguments, + // that MAAP value reflects potential races via an ancestor, which should + // not disable checking of local races. + Value *LocalCheck; + Value *FlagCheck = IRB.CreateAnd( + FlagLoad, getMAAPIRValue(IRB, RaceTypeToFlagVal(RD.Type))); + LLVM_DEBUG(dbgs() << " FlagCheck " << *FlagCheck << "\n"); + LocalCheck = IRB.CreateICmpEQ(getMAAPIRValue(IRB, 0), FlagCheck); + LLVM_DEBUG(dbgs() << " LocalCheck " << *LocalCheck << "\n"); + + // Add the check. + MAAPChk = IRB.CreateAnd(MAAPChk, LocalCheck); + LLVM_DEBUG(dbgs() << " MAAPChk " << *MAAPChk << "\n"); + + // Get the dynamic no-alias bit from the MAAP value. + Value *NoAliasCheck = IRB.CreateICmpNE( + getMAAPIRValue(IRB, 0), + IRB.CreateAnd( + FlagLoad, + getMAAPIRValue(IRB, static_cast(MAAPValue::NoAlias)))); + + if (RD.Racer.isValid()) { + for (const Value *RObj : RacerObjects) { + LLVM_DEBUG(dbgs() << " Racer Object " << *RObj << "\n"); + // If the racer object matches Obj, there's no need to check a flag. + if (RObj == Obj) { + MAAPChk = IRB.getFalse(); + continue; + } + + // Check if Loc and the racer object may alias. + if (!AA->alias(Loc, MemoryLocation::getBeforeOrAfter(RObj))) + continue; + + if (!LocalMAAPs.count(RObj)) { + LLVM_DEBUG(dbgs() << "No local MAAP found for racer object " << RObj + << "\n"); + MAAPChk = IRB.getFalse(); + continue; + } + + Value *FlagLoad = readMAAPVal(LocalMAAPs[RObj], IRB); + LLVM_DEBUG(dbgs() << " FlagLoad " << *FlagLoad << "\n"); + Value *LocalCheck; + Value *FlagCheck = IRB.CreateAnd( + FlagLoad, getMAAPIRValue(IRB, RaceTypeToFlagVal(RD.Type))); + LLVM_DEBUG(dbgs() << " FlagCheck " << *FlagCheck << "\n"); + LocalCheck = IRB.CreateICmpEQ(getMAAPIRValue(IRB, 0), FlagCheck); + LLVM_DEBUG(dbgs() << " LocalCheck " << *LocalCheck << "\n"); + + // Add the check. + Value *RObjNoAliasFlag = IRB.CreateAnd( + FlagLoad, + getMAAPIRValue(IRB, static_cast(MAAPValue::NoAlias))); + Value *RObjNoAliasCheck = + IRB.CreateICmpNE(getMAAPIRValue(IRB, 0), RObjNoAliasFlag); + MAAPChk = IRB.CreateAnd( + MAAPChk, + IRB.CreateOr(IRB.CreateOr(NoAliasCheck, RObjNoAliasCheck), + LocalCheck)); + } + } + + // Check the function arguments that might alias this object. + for (Argument &Arg : F->args()) { + // Ignore non-pointer arguments + if (!Arg.getType()->isPtrOrPtrVectorTy()) + continue; + // Ignore any arguments that match checked objects. + if (&Arg == Obj) + continue; + // Check if Loc and Arg may alias. + if (!AA->alias(Loc, MemoryLocation::getBeforeOrAfter(&Arg))) + continue; + // If we have no local MAAP information about the argument, give up. + if (!LocalMAAPs.count(&Arg)) { + LLVM_DEBUG(dbgs() << "No local MAAP found for arg " << Arg << "\n"); + return IRB.getFalse(); + } + + LLVM_DEBUG(dbgs() << " Argument " << Arg << "\n"); + + // Incorporate the MAAP value for this argument if we don't have + // a dynamic no-alias bit set. 
+ Value *FlagLoad = readMAAPVal(LocalMAAPs[&Arg], IRB); + Value *FlagCheck; + FlagCheck = IRB.CreateAnd( + FlagLoad, getMAAPIRValue(IRB, RaceTypeToFlagVal(RD.Type))); + Value *LocalCheck = IRB.CreateICmpEQ(getMAAPIRValue(IRB, 0), FlagCheck); + + Value *ArgNoAliasFlag = IRB.CreateAnd( + FlagLoad, + getMAAPIRValue(IRB, static_cast(MAAPValue::NoAlias))); + Value *ArgNoAliasCheck = + IRB.CreateICmpNE(getMAAPIRValue(IRB, 0), ArgNoAliasFlag); + MAAPChk = IRB.CreateAnd( + MAAPChk, IRB.CreateOr(IRB.CreateOr(NoAliasCheck, ArgNoAliasCheck), + LocalCheck)); + } + } + } + return MAAPChk; +} + +bool CilkSanitizerImpl::Instrumentor::PerformDelayedInstrumentation() { + bool Result = false; + // Handle delayed simple instructions + for (Instruction *I : DelayedSimpleInsts) { + assert((RI.mightRaceViaAncestor(I) || RI.mightRaceLocally(I)) && + "Delayed instrumentation is not local race or race via ancestor"); + IRBuilder<> IRB(I); + DebugLoc Loc = searchForDebugLoc(I); + + if (MAAPChecks) { + Value *MAAPChk = getMAAPCheck(I, IRB); + if (MAAPChk != IRB.getFalse()) { + Instruction *CheckTerm = + SplitBlockAndInsertIfThen(IRB.CreateICmpEQ(MAAPChk, IRB.getFalse()), + I, false, nullptr, &DTU, &LI); + IRB.SetInsertPoint(CheckTerm); + } + } + if (Loc) + IRB.SetCurrentDebugLocation(Loc); + if (isa(I) || isa(I)) + Result |= CilkSanImpl.instrumentLoadOrStore(I, IRB); + else if (isa(I) || isa(I)) + Result |= CilkSanImpl.instrumentAtomic(I, IRB); + else + dbgs() << "[Cilksan] Unknown simple instruction: " << *I << "\n"; + } + + // Handle delayed memory intrinsics + for (auto &MemIntrinOp : DelayedMemIntrinsics) { + Instruction *I = MemIntrinOp.first; + assert((RI.mightRaceViaAncestor(I) || RI.mightRaceLocally(I)) && + "Delayed instrumentation is not local race or race via ancestor"); + unsigned OperandNum = MemIntrinOp.second; + IRBuilder<> IRB(I); + DebugLoc Loc = searchForDebugLoc(I); + + if (MAAPChecks) { + Value *MAAPChk = getMAAPCheck(I, IRB, OperandNum); + if (MAAPChk != IRB.getFalse()) { + Instruction *CheckTerm = + SplitBlockAndInsertIfThen(IRB.CreateICmpEQ(MAAPChk, IRB.getFalse()), + I, false, nullptr, &DTU, &LI); + IRB.SetInsertPoint(CheckTerm); + } + } + if (Loc) + IRB.SetCurrentDebugLocation(Loc); + Result |= CilkSanImpl.instrumentAnyMemIntrinAcc(I, OperandNum, IRB); + } + return Result; +} + +// Helper function to walk the hierarchy of tasks containing BasicBlock BB to +// get the top-level task in loop L that contains BB. +static Task *GetTopLevelTaskFor(BasicBlock *BB, Loop *L, TaskInfo &TI) { + Task *T = TI.getTaskFor(BB); + // Return null if we don't find a task for BB contained in L. + if (!T || !L->contains(T->getEntry())) + return nullptr; + + // Walk up the tree of tasks until we discover a task containing BB that is + // outside of L. + while (L->contains(T->getParentTask()->getEntry())) + T = T->getParentTask(); + + return T; +} + +void CilkSanitizerImpl::Instrumentor::GetDetachesForCoalescedInstrumentation( + SmallPtrSetImpl &LoopInstToHoist, + SmallPtrSetImpl &LoopInstToSink) { + // Determine detaches to instrument for the coalesced instrumentation. + for (Instruction *I : LoopInstToHoist) { + Loop *L = LI.getLoopFor(I->getParent()); + // Record the detaches for the loop preheader, where the coalesced + // instrumentation will be inserted. 
+ getDetachesForInstruction(L->getLoopPreheader()->getTerminator()); + } + for (Instruction *I : LoopInstToSink) { + Loop *L = LI.getLoopFor(I->getParent()); + SmallVector ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + for (BasicBlock *ExitBB : ExitBlocks) { + if (GetTopLevelTaskFor(ExitBB, L, TI)) + // Skip any exit blocks in a Tapir task inside the loop. These exit + // blocks lie on exception-handling paths, and to handle these blocks, + // it suffices to insert instrumentation in the unwind destination of + // the corresponding detach, which must also be a loop-exit block. + continue; + + // Record the detaches for the exit block, where the coalesced + // instrumentation will be inserted. + getDetachesForInstruction(ExitBB->getTerminator()); + } + } +} + +bool CilkSanitizerImpl::Instrumentor::InstrumentAncillaryInstructions( + SmallPtrSetImpl &Allocas, + SmallPtrSetImpl &AllocationFnCalls, + SmallPtrSetImpl &FreeCalls, + DenseMap &SyncRegNums, + DenseMap &SRCounters, const DataLayout &DL) { + bool Result = false; + SmallPtrSet Syncs; + SmallPtrSet Loops; + SmallPtrSet InstrumentedAllocFns; + + // Instrument allocas and allocation-function calls that may be involved in a + // race. + for (Instruction *I : Allocas) { + if (CilkSanImpl.ObjectMRForRace.count(I) || + CilkSanImpl.lookupPointerMayBeCaptured(I)) { + CilkSanImpl.instrumentAlloca(I, TI); + getDetachesForInstruction(I); + Result = true; + } + } + for (Instruction *I : AllocationFnCalls) { + // Instrument any allocation-function calls that may allocate memory + // involved in a race. + // + // Note that, without MAAP checks, we must be more conservative about + // considering what memory allocations might be involved in checking for + // races. For example, suppose a function call in a loop uses memory that + // is malloc'd and free'd within that loop. Static analysis might determine + // no race is possible on that memory, but a MAAP check is needed to + // communicate that static information to the function at runtime in order + // to avoid dynamic checks on the same location returned by repeated calls + // to malloc. + + // FIXME: This test won't identify posix_memalign calls as needing + // instrumentation, because posix_memalign modifies a pointer to the pointer + // to the object. + if (!MAAPChecks || CilkSanImpl.ObjectMRForRace.count(I) || + CilkSanImpl.lookupPointerMayBeCaptured(I)) { + CilkSanImpl.instrumentAllocationFn(I, DT, TLI); + InstrumentedAllocFns.insert(I); + getDetachesForInstruction(I); + Result = true; + } + } + for (Instruction *I : FreeCalls) { + // The first argument of the free call is the pointer. + Value *Ptr = I->getOperand(0); + // If the pointer corresponds to an allocation function call in this + // function, or if the pointer is involved in a race, then instrument it. 
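+ // For example (illustrative):
+ //   int *p = (int *)malloc(n);  // allocation call instrumented above
+ //   ...
+ //   free(p);                    // instrumented here, so the tool can
+ //                               // retire its state for that allocation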
+ if (Instruction *PtrI = dyn_cast(Ptr)) { + if (InstrumentedAllocFns.count(PtrI)) { + CilkSanImpl.instrumentFree(I, TLI); + getDetachesForInstruction(I); + Result = true; + continue; + } + } + if (RI.ObjectInvolvedInRace(Ptr) || + CilkSanImpl.unknownObjectUses(Ptr, &LI, TLI)) { + CilkSanImpl.instrumentFree(I, TLI); + getDetachesForInstruction(I); + Result = true; + } + } + + // Instrument detaches + for (DetachInst *DI : Detaches) { + CilkSanImpl.instrumentDetach(DI, SyncRegNums[DI->getSyncRegion()], + SRCounters[DI->getDetached()], DT, TI, LI); + Result = true; + // Get syncs associated with this detach + for (SyncInst *SI : CilkSanImpl.DetachToSync[DI]) + Syncs.insert(SI); + + if (CilkSanImpl.Options.InstrumentLoops) { + // Get any loop associated with this detach. + Loop *L = LI.getLoopFor(DI->getParent()); + if (spawnsTapirLoopBody(DI, LI, TI)) + Loops.insert(L); + } + } + + // Instrument associated syncs + for (SyncInst *SI : Syncs) + CilkSanImpl.instrumentSync(SI, SyncRegNums[SI->getSyncRegion()]); + + if (CilkSanImpl.Options.InstrumentLoops) { + // Recursively instrument all loops + for (Loop *L : Loops) + CilkSanImpl.instrumentTapirLoop(*L, TI, SyncRegNums); + } + + return Result; +} + +// Helper function to get a value for the runtime trip count of the given loop. +static const SCEV *getRuntimeTripCount(Loop &L, ScalarEvolution *SE, + bool IsTapirLoop) { + BasicBlock *Latch = L.getLoopLatch(); + + // The exit count from the latch is sufficient for Tapir loops, because early + // exits from such loops are handled in a special manner. For other loops, we + // use the backedge taken count. + const SCEV *BECountSC = + IsTapirLoop ? SE->getExitCount(&L, Latch) : SE->getBackedgeTakenCount(&L); + if (isa(BECountSC) || + !BECountSC->getType()->isIntegerTy()) { + LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n"); + return SE->getCouldNotCompute(); + } + + // Add 1 since the backedge count doesn't include the first loop iteration. + const SCEV *TripCountSC = + SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); + if (isa(TripCountSC)) { + LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n"); + return SE->getCouldNotCompute(); + } + + return TripCountSC; +} + +// Helper function to find where in the given basic block to insert coalesced +// instrumentation. +static Instruction *getLoopBlockInsertPt(BasicBlock *BB, FunctionCallee LoopHook, + bool AfterHook) { + // BasicBlock *PreheaderBB = L->getLoopPreheader(); + for (Instruction &I : *BB) + if (CallBase *CB = dyn_cast(&I)) + if (const Function *Called = CB->getCalledFunction()) + if (Called == LoopHook.getCallee()) { + // We found a call to the specified hook. Pick an insertion point + // with respect to it. + if (AfterHook) + return &*CB->getIterator()->getNextNode(); + else + return CB; + } + + if (AfterHook) + return &*BB->getFirstInsertionPt(); + else + return BB->getTerminator(); +} + +// TODO: Maybe to avoid confusion with CilkSanImpl.Options.InstrumentLoops +// (which is unrelated to this), rename this to involve the word "hoist" or something. +bool CilkSanitizerImpl::Instrumentor::InstrumentLoops( + SmallPtrSetImpl &LoopInstToHoist, + SmallPtrSetImpl &LoopInstToSink, + SmallPtrSetImpl &TapirLoops, ScalarEvolution *SE) { + bool Result = false; + + // First insert computation for the hook arguments for all instructions to + // hoist or sink coalesced instrumentation. 
We do this before inserting the + // hook calls themselves, so that changes to the CFG -- specifically, from + // inserting MAAP checks -- do not disrupt any function analyses we need. + + // Map instructions in the loop to address and range arguments for coalesced + // instrumentation. + DenseMap> HoistedHookArgs; + // Compute arguments for coalesced instrumentation hoisted to before the loop. + for (Instruction *I : LoopInstToHoist) { + // Get the insertion point in the preheader of the loop. + Loop *L = LI.getLoopFor(I->getParent()); + assert(L->getLoopPreheader() && "No preheader for loop"); + Instruction *InsertPt = + getLoopBlockInsertPt(L->getLoopPreheader(), CilkSanImpl.CsanBeforeLoop, + /*AfterHook*/ false); + + // TODO: Unify this SCEV computation with the similar computation for + // instructions in LoopInstToSink. + + // Get the SCEV describing this instruction's pointer + const SCEV *V = SE->getSCEV(getLoadStorePointerOperand(I)); + const SCEVAddRecExpr *SrcAR = dyn_cast(V); + + // Get the stride + const SCEV *StrideExpr = SrcAR->getStepRecurrence(*SE); + assert(!isa(StrideExpr) && + "Stride should be computable"); + bool NegativeStride = SE->isKnownNegative(StrideExpr); + if (NegativeStride) + StrideExpr = SE->getNegativeSCEV(StrideExpr); + + // Get the first address accessed. + const SCEV *FirstAddr = SrcAR->getStart(); + + // Get the last address accessed. + BasicBlock *Latch = L->getLoopLatch(); + const SCEV *BECount = TapirLoops.count(L) ? SE->getExitCount(L, Latch) + : SE->getBackedgeTakenCount(L); + const SCEV *LastAddr = SrcAR->evaluateAtIteration(BECount, *SE); + + // Get the size (number of bytes) of the address range accessed. + const SCEV *RangeExpr = NegativeStride + ? SE->getMinusSCEV(FirstAddr, LastAddr) + : SE->getMinusSCEV(LastAddr, FirstAddr); + RangeExpr = SE->getAddExpr(RangeExpr, StrideExpr); + + // Get the start (lowest address) of the address range accessed. + const SCEV *Addr = NegativeStride ? LastAddr : FirstAddr; + + // Get instructions for calculating address range + const DataLayout &DL = CilkSanImpl.M.getDataLayout(); + LLVMContext &Ctx = CilkSanImpl.M.getContext(); + SCEVExpander Expander(*SE, DL, "cilksan"); + + Value *AddrVal = + Expander.expandCodeFor(Addr, Type::getInt8PtrTy(Ctx), InsertPt); + Value *RangeVal = + Expander.expandCodeFor(RangeExpr, Type::getInt64Ty(Ctx), InsertPt); + HoistedHookArgs[I] = std::make_pair(AddrVal, RangeVal); + } + + // Map pairs of instruction and loop-exit to address and range arguments for + // coalesced instrumentation. + DenseMap, std::pair> + SunkHookArgs; + // Map to track which loops we have already created counters for + SmallMapVector LoopToCounterMap; + // Compute arguments for coalesced instrumentation sunk after the loop. + for (Instruction *I : LoopInstToSink) { + // Get the loop + Loop *L = LI.getLoopFor(I->getParent()); + + // Add a counter to count the number of iterations executed in this loop. + // In particular, this count will record the number of times the backedge of + // the loop is taken. 
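+ // Sketch of the counter inserted below (IR shown for illustration only):
+ //   header:  %iter = phi i64 [ 0, %preheader ], [ %iter.next, %latch ]
+ //   latch:   %iter.next = add nuw nsw i64 %iter, 1
+ // At a loop exit, %iter holds the number of times the backedge was taken,
+ // which bounds the address range used by the sunk instrumentation.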
+ if (!LoopToCounterMap.count(L)) { + assert(L->getLoopPreheader() && "No preheader for loop"); + assert(L->getLoopLatch() && "No unique latch for loop"); + IRBuilder<> IRB(&L->getHeader()->front()); + LLVMContext &Ctx = CilkSanImpl.M.getContext(); + + PHINode *PN = IRB.CreatePHI(Type::getInt64Ty(Ctx), 2); + PN->addIncoming(ConstantInt::getNullValue(Type::getInt64Ty(Ctx)), + L->getLoopPreheader()); + IRB.SetInsertPoint(&*L->getLoopLatch()->getFirstInsertionPt()); + Value *Add = IRB.CreateAdd(PN, ConstantInt::get(Type::getInt64Ty(Ctx), 1), + "", true, true); + PN->addIncoming(Add, L->getLoopLatch()); + LoopToCounterMap.insert(std::make_pair(L, PN)); + } + + // Get the counter for this loop. + Value *Counter = LoopToCounterMap[L]; + + // Get the SCEV describing this instruction's pointer + const SCEV *V = SE->getSCEV(getLoadStorePointerOperand(I)); + const SCEVAddRecExpr *SrcAR = dyn_cast(V); + + // Get the stride + const SCEV *StrideExpr = SrcAR->getStepRecurrence(*SE); + assert(!isa(StrideExpr) && + "Stride should be computable"); + bool NegativeStride = SE->isKnownNegative(StrideExpr); + if (NegativeStride) + StrideExpr = SE->getNegativeSCEV(StrideExpr); + + // Get the first address accessed. + const SCEV *FirstAddr = SrcAR->getStart(); + + // Get the last address accessed, based on the counter value.. + const SCEV *BECount = SE->getUnknown(Counter); + const SCEV *LastAddr = SrcAR->evaluateAtIteration(BECount, *SE); + + // Get the size (number of bytes) of the address range accessed. + const SCEV *RangeExpr = NegativeStride + ? SE->getMinusSCEV(FirstAddr, LastAddr) + : SE->getMinusSCEV(LastAddr, FirstAddr); + RangeExpr = SE->getAddExpr(RangeExpr, StrideExpr); + // Get the start (lowest address) of the address range accessed. + const SCEV *Addr = NegativeStride ? LastAddr : FirstAddr; + + // Expand SCEV's into instructions for calculating the coalesced hook + // arguments in each exit block. + LLVMContext &Ctx = CilkSanImpl.M.getContext(); + const DataLayout &DL = CilkSanImpl.M.getDataLayout(); + SCEVExpander Expander(*SE, DL, "cilksan"); + SmallVector ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + for (BasicBlock *ExitBB : ExitBlocks) { + if (GetTopLevelTaskFor(ExitBB, L, TI)) + // Skip any exit blocks in a Tapir task inside the loop. These exit + // blocks lie on exception-handling paths, and to handle these blocks, + // it suffices to insert instrumentation in the unwind destination of + // the corresponding detach, which must also be a loop-exit block. + continue; + + // Instruction *InsertPt = &*ExitBB->getFirstInsertionPt(); + Instruction *InsertPt = + getLoopBlockInsertPt(ExitBB, CilkSanImpl.CsanAfterLoop, + /*AfterHook*/ true); + Value *AddrVal = + Expander.expandCodeFor(Addr, Type::getInt8PtrTy(Ctx), InsertPt); + Value *RangeVal = + Expander.expandCodeFor(RangeExpr, Type::getInt64Ty(Ctx), InsertPt); + + assert(isa(RangeVal) && + "Expected computation of RangeVal to produce an instruction."); + SunkHookArgs[std::make_pair(I, ExitBB)] = + std::make_pair(AddrVal, RangeVal); + } + } + + // Now insert coalesced instrumentation, including relevant MAAP checks. + // + // TODO: For now, we only handle LoadInst and StoreInst. Add other operations + // later, such as atomics and memory intrinsics. + + // Insert coalesced instrumentation hoisted before the loop. + for (Instruction *I : LoopInstToHoist) { + LLVM_DEBUG(dbgs() << "Loop instruction for hoisting instrumentation: " << *I + << "\n"); + + // Get the local ID of this instruction. 
+ uint64_t LocalId; + if (LoadInst *LI = dyn_cast(I)) { + uint64_t LoadId = CilkSanImpl.LoadFED.add(*LI); + + // TODO: Don't recalculate underlying objects + uint64_t LoadObjId = CilkSanImpl.LoadObj.add( + *LI, + CilkSanImpl.lookupUnderlyingObject(getLoadStorePointerOperand(LI))); + assert(LoadId == LoadObjId && + "Load received different ID's in FED and object tables."); + LocalId = LoadId; + // Update the statistic here, since we're guaranteed to insert the hook at + // this point. + ++NumHoistedInstrumentedReads; + } else if (StoreInst *SI = dyn_cast(I)) { + uint64_t StoreId = CilkSanImpl.StoreFED.add(*SI); + + // TODO: Don't recalculate underlying objects + uint64_t StoreObjId = CilkSanImpl.StoreObj.add( + *SI, + CilkSanImpl.lookupUnderlyingObject(getLoadStorePointerOperand(SI))); + assert(StoreId == StoreObjId && + "Store received different ID's in FED and object tables."); + LocalId = StoreId; + // Update the statistic here, since we're guaranteed to insert the hook at + // this point. + ++NumHoistedInstrumentedWrites; + } else + llvm_unreachable("Unexpected instruction to hoist instrumentation."); + + // For now, there shouldn't be a reason to return false since we already + // verified the size, stride, and tripcount. + Loop *L = LI.getLoopFor(I->getParent()); + Instruction *InsertPt = + getLoopBlockInsertPt(L->getLoopPreheader(), CilkSanImpl.CsanBeforeLoop, + /*AfterLoop*/ false); + IRBuilder<> IRB(InsertPt); + if (MAAPChecks) { + Value *MAAPChk = getMAAPCheck(I, IRB); + if (MAAPChk != IRB.getFalse()) { + Instruction *CheckTerm = + SplitBlockAndInsertIfThen(IRB.CreateICmpEQ(MAAPChk, IRB.getFalse()), + InsertPt, false, nullptr, &DTU, &LI); + IRB.SetInsertPoint(CheckTerm); + } + } + IRB.SetCurrentDebugLocation(searchForDebugLoc(I)); + CilkSanImpl.instrumentLoadOrStoreHoisted( + I, HoistedHookArgs[I].first, HoistedHookArgs[I].second, IRB, LocalId); + Result = true; + } + + // Insert coalesced instrumentation sunk after the loop. + for (Instruction *I : LoopInstToSink) { + LLVM_DEBUG(dbgs() << "Loop instruction for sinking instrumentation: " << *I + << "\n"); + Loop *L = LI.getLoopFor(I->getParent()); + + // Get the local ID of this instruction. We do this computation early to + // avoid recomputing the local ID once per exit block. + uint64_t LocalId; + if (LoadInst *LI = dyn_cast(I)) { + uint64_t LoadId = CilkSanImpl.LoadFED.add(*LI); + + // TODO: Don't recalculate underlying objects + uint64_t LoadObjId = CilkSanImpl.LoadObj.add( + *LI, + CilkSanImpl.lookupUnderlyingObject(getLoadStorePointerOperand(LI))); + assert(LoadId == LoadObjId && + "Load received different ID's in FED and object tables."); + LocalId = LoadId; + // Update the statistic here, since we're guaranteed to insert the hooks + // at this point, and to avoid overcounting the number of instructions on + // loops with multiple exits. + ++NumSunkInstrumentedReads; + } else if (StoreInst *SI = dyn_cast(I)) { + uint64_t StoreId = CilkSanImpl.StoreFED.add(*SI); + + // TODO: Don't recalculate underlying objects + uint64_t StoreObjId = CilkSanImpl.StoreObj.add( + *SI, + CilkSanImpl.lookupUnderlyingObject(getLoadStorePointerOperand(SI))); + assert(StoreId == StoreObjId && + "Store received different ID's in FED and object tables."); + LocalId = StoreId; + // Update the statistic here, since we're guaranteed to insert the hooks + // at this point, and to avoid overcounting the number of instructions on + // loops with multiple exits. 
+ ++NumSunkInstrumentedWrites; + } else + llvm_unreachable("Unexpected instruction to sink instrumentation."); + + SmallVector ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + for (BasicBlock *ExitBB : ExitBlocks) { + if (GetTopLevelTaskFor(ExitBB, L, TI)) + // Skip any exit blocks in a Tapir task inside the loop. These exit + // blocks lie on exception-handling paths, and to handle these blocks, + // it suffices to insert instrumentation in the unwind destination of + // the corresponding detach, which must also be a loop-exit block. + continue; + + // After the loop, perform the coalesced read/write. + auto HookArgsKey = std::make_pair(I, ExitBB); + + // Insert the hook call after the computation of RangeVal. + Instruction *InsertPt = + cast(SunkHookArgs[HookArgsKey].second) + ->getIterator() + ->getNextNode(); + IRBuilder<> IRB(&*InsertPt); + if (MAAPChecks) { + Value *MAAPChk = getMAAPCheck(I, IRB); + if (MAAPChk != IRB.getFalse()) { + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + IRB.CreateICmpEQ(MAAPChk, IRB.getFalse()), &*InsertPt, false, + nullptr, &DTU, &LI); + IRB.SetInsertPoint(CheckTerm); + } + } + IRB.SetCurrentDebugLocation(searchForDebugLoc(I)); + CilkSanImpl.instrumentLoadOrStoreHoisted( + I, SunkHookArgs[HookArgsKey].first, SunkHookArgs[HookArgsKey].second, + IRB, LocalId); + Result = true; + } + } + + return Result; +} + +bool CilkSanitizerImpl::instrumentLoadOrStoreHoisted(Instruction *I, + Value *Addr, Value *Size, + IRBuilder<> &IRB, + uint64_t LocalId) { + // The caller of this method is guaranteed to have computed the Addr and Size + // values with the right type for the hook, so no additional type conversions + // are needed. + CsiLoadStoreProperty Prop; + if (LoadInst *LI = dyn_cast(I)) { + Prop.setAlignment(MaybeAlign(LI->getAlign())); + Prop.setIsThreadLocal(isThreadLocalObject(lookupUnderlyingObject(Addr))); + // Instrument the load + Value *CsiId = LoadFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, Addr, Size, Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanLargeRead, Args); + IRB.SetInstDebugLocation(Call); + } else if (StoreInst *SI = dyn_cast(I)) { + Prop.setAlignment(SI->getAlign()); + Prop.setIsThreadLocal(isThreadLocalObject(lookupUnderlyingObject(Addr))); + // Instrument the store + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, Addr, Size, Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanLargeWrite, Args); + IRB.SetInstDebugLocation(Call); + } + return true; +} + +static bool CheckSanitizeCilkAttr(Function &F) { + if (IgnoreSanitizeCilkAttr) + return true; + return F.hasFnAttribute(Attribute::SanitizeCilk); +} + +bool CilkSanitizerImpl::setupFunction(Function &F, bool NeedToSetupCalls) { + if (F.empty() || shouldNotInstrumentFunction(F) || + LinkedFromBitcode.count(&F) || !CheckSanitizeCilkAttr(F)) { + LLVM_DEBUG({ + dbgs() << "Skipping " << F.getName() << "\n"; + if (F.empty()) + dbgs() << " Empty function\n"; + else if (shouldNotInstrumentFunction(F)) + dbgs() << " Function should not be instrumented\n"; + else if (LinkedFromBitcode.count(&F)) + dbgs() << " Function from linked-in bitcode\n"; + else if (!CheckSanitizeCilkAttr(F)) + dbgs() << " Function lacks sanitize_cilk attribute\n"; + }); + return false; + } + + LLVM_DEBUG(dbgs() << "Setting up " << F.getName() + << " for instrumentation\n"); + + // TODO: Move these steps into csi-setup pass. 
+ + if (NeedToSetupCalls && Options.CallsMayThrow) + // Promote calls to invokes to insert instrumentation in exception-handling + // code. + setupCalls(F); + + DominatorTree &DT = GetDomTree(F); + LoopInfo &LI = GetLoopInfo(F); + + if (Options.InstrumentLoops) + // Simplify loops to prepare for loop instrumentation + for (Loop *L : LI) + simplifyLoop(L, &DT, &LI, nullptr, nullptr, nullptr, + /* PreserveLCSSA */ false); + + // Canonicalize the CFG for instrumentation. + setupBlocks(F, &DT, &LI); + + return true; +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(Function &Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented.getSubprogram(); + if (Subprog) { + LLVMContext &C = Instrumented.getParent()->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } +} + +bool CilkSanitizerImpl::instrumentFunctionUsingRI(Function &F) { + + if (F.empty() || shouldNotInstrumentFunction(F) || + !CheckSanitizeCilkAttr(F)) { + LLVM_DEBUG({ + dbgs() << "Skipping " << F.getName() << "\n"; + if (F.empty()) + dbgs() << " Empty function\n"; + else if (shouldNotInstrumentFunction(F)) + dbgs() << " Function should not be instrumented\n"; + else if (!CheckSanitizeCilkAttr(F)) + dbgs() << " Function lacks sanitize_cilk attribute\n";}); + return false; + } + + LLVM_DEBUG(dbgs() << "Instrumenting " << F.getName() << "\n"); + + SmallVector AllLoadsAndStores; + SmallVector LocalLoadsAndStores; + SmallVector AtomicAccesses; + SmallVector MemIntrinCalls; + SmallVector IntrinsicCalls; + SmallVector LibCalls; + SmallVector Callsites; + // Ancillary instructions + SmallPtrSet Allocas; + SmallPtrSet AllocationFnCalls; + SmallPtrSet FreeCalls; + SmallVector Syncs; + DenseMap SRCounters; + DenseMap SyncRegNums; + + // Find instructions that can be hoisted or sinked + SmallPtrSet LoopInstToHoist; + SmallPtrSet LoopInstToSink; + SmallPtrSet TapirLoops; + + const TargetLibraryInfo *TLI = &GetTLI(F); + DominatorTree &DT = GetDomTree(F); + LoopInfo &LI = GetLoopInfo(F); + TaskInfo &TI = GetTaskInfo(F); + RaceInfo &RI = GetRaceInfo(F); + + ICFLoopSafetyInfo SafetyInfo; + + ScalarEvolution &SE = *(RI.getSE()); + + for (BasicBlock &BB : F) { + // Record the Tapir sync instructions found + if (SyncInst *SI = dyn_cast(BB.getTerminator())) + Syncs.push_back(SI); + + // get loop for BB + Loop *L = LI.getLoopFor(&BB); + if (L) + SafetyInfo.computeLoopSafetyInfo(L); + + // Record the memory accesses in the basic block + for (Instruction &Inst : BB) { + bool CanCoalesce = false; + // If the instruction is in a loop and can only race via ancestor, and + // size < stride, store it. + if (L && EnableStaticRaceDetection && LoopHoisting && + SafetyInfo.isGuaranteedToExecute(Inst, &DT, &TI, L)) { + // TODO: For now, only look at loads and stores. Add atomics later. + // Need to add any others? 
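+ // Illustrative example of what the checks below look for: in
+ //   for (i = 0; i < n; ++i) sum += A[i];   // 4-byte loads, stride 4
+ // the element size is at least the stride, so the accessed bytes form one
+ // contiguous range, and (provided the load can only race via an ancestor)
+ // its per-iteration instrumentation can be replaced by a single ranged
+ // check covering the whole slice of A.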
+ if (isa(Inst) || isa(Inst)) { + bool RaceViaAncestor = false; + bool OtherRace = false; + for (const RaceInfo::RaceData &RD : RI.getRaceData(&Inst)) { + if (RaceInfo::isRaceViaAncestor(RD.Type)) { + RaceViaAncestor = true; + } else if (RaceInfo::isOpaqueRace(RD.Type)) { + LLVM_DEBUG(dbgs() << "Can't hoist or sink instrumentation for " + << Inst << "\n Opaque race.\n"); + OtherRace = true; + break; + } else if (RaceInfo::isLocalRace(RD.Type)) { + if (!RD.Racer.isValid()) { + LLVM_DEBUG(dbgs() + << "Can't hoist or sink instrumentation for " << Inst + << "\n Local race with opaque racer.\n"); + OtherRace = true; + break; + } else if (LI.getLoopFor(RD.Racer.I->getParent()) == L) { + LLVM_DEBUG(dbgs() + << "Can't hoist or sink instrumentation for " << Inst + << "\n Local race with racer in same loop: " + << *RD.Racer.I << "\n"); + OtherRace = true; + break; + } + RaceViaAncestor = true; + } + } + // If this instruction can only race via an ancestor, see if it can be + // hoisted. + if (RaceViaAncestor && !OtherRace) { + const SCEV *Size = SE.getElementSize(&Inst); + const SCEV *V = SE.getSCEV(getLoadStorePointerOperand(&Inst)); + // If not an AddRecExpr, don't proceed + if (const SCEVAddRecExpr *SrcAR = dyn_cast(V)) { + const SCEV *Stride = SrcAR->getStepRecurrence(SE); + const SCEV *Diff; + if (SE.isKnownNonNegative(Stride)) { + Diff = SE.getMinusSCEV(Size, Stride); + } else { + // If we can't compare size and stride, + // SE.isKnownNonNegative(Diff) will be false. + Diff = SE.getAddExpr(Size, Stride); + } + bool isTapirLoop = static_cast(getTaskIfTapirLoop(L, &TI)); + if (isTapirLoop) + TapirLoops.insert(L); + const SCEV *TripCount = getRuntimeTripCount(*L, &SE, isTapirLoop); + + if (SE.isKnownNonNegative(Diff)) { + if (!isa(TripCount) && + SE.isAvailableAtLoopEntry(SrcAR->getStart(), L)) { + // Can hoist if stride <= size and the tripcount is known and + // the start is available at loop entry. + LoopInstToHoist.insert(&Inst); + CanCoalesce = true; + LLVM_DEBUG(dbgs() << "Can hoist instrumentation for " << Inst << "\n"); + } else if (!isa( + SE.getConstantMaxBackedgeTakenCount(L))) { + // Can sink if stride <= size and the tripcount is unknown but + // guaranteed to be finite. + LoopInstToSink.insert(&Inst); + CanCoalesce = true; + LLVM_DEBUG(dbgs() << "Can sink instrumentation for " << Inst << "\n"); + } else { + LLVM_DEBUG(dbgs() + << "Can't hoist or sink instrumentation for " + << Inst << "\n TripCount = " << *TripCount + << "\n SrcAR->getStart() = " << *SrcAR->getStart() + << "\n SE.getConstantMaxBackedgeTakenCount(L) = " + << *SE.getConstantMaxBackedgeTakenCount(L) + << "\n"); + } + } else { + LLVM_DEBUG(dbgs() << "Can't hoist instrumentation for " << Inst + << "\n Diff SCEV not known non-negative: " + << *Diff << "\n"); + } + } else { + LLVM_DEBUG( + dbgs() + << "Can't hoist or sink instrumentation for " << Inst + << "\n SCEV for load/store pointer operand not AddRecExpr: " + << *V << ": " << V->getSCEVType() << "\n"); + } + } + } + } + + if (!CanCoalesce) { + // TODO: Handle VAArgInst + if (isa(Inst) || isa(Inst)) + LocalLoadsAndStores.push_back(&Inst); + else if (isa(Inst) || isa(Inst)) + AtomicAccesses.push_back(&Inst); + else if (isa(Inst)) + Allocas.insert(&Inst); + else if (isa(Inst)) { + // if (CallInst *CI = dyn_cast(&Inst)) + // maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI); + + // If we find a sync region, record it. 
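+ // For example (illustrative): the first llvm.syncregion.start in a task is
+ // assigned number 0, the next one 1, and so on; detaches and syncs using a
+ // given region are later instrumented with that region's number.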
+ if (const IntrinsicInst *II = dyn_cast(&Inst)) + if (Intrinsic::syncregion_start == II->getIntrinsicID()) { + // Identify this sync region with a counter value, where all sync + // regions within a function or task are numbered from 0. + if (TI.getTaskFor(&BB)) { + BasicBlock *TEntry = TI.getTaskFor(&BB)->getEntry(); + // Create a new counter if need be. + if (!SRCounters.count(TEntry)) + SRCounters[TEntry] = 0; + SyncRegNums[&Inst] = SRCounters[TEntry]++; + } + } + + // Record this function call as either an allocation function, a call to + // free (or delete), a memory intrinsic, or an ordinary real function + // call. + if (isAllocFn(&Inst, TLI)) + AllocationFnCalls.insert(&Inst); + else if (isFreeFn(&Inst, TLI)) + FreeCalls.insert(&Inst); + else if (isa(Inst)) + MemIntrinCalls.push_back(&Inst); + else if (!simpleCallCannotRace(Inst) && !shouldIgnoreCall(Inst)) { + if (isa(&Inst)) { + if (Inst.mayReadOrWriteMemory()) + IntrinsicCalls.push_back(&Inst); + } else if (isLibCall(Inst, TLI)) { + if (Inst.mayReadOrWriteMemory()) + LibCalls.push_back(&Inst); + } else { + Callsites.push_back(&Inst); + } + } + } + + // Add the current set of local loads and stores to be considered for + // instrumentation. + if (!simpleCallCannotRace(Inst)) { + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, + TI, LI, TLI); + } + } + } + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, TI, + LI, TLI); + } + + // Evaluate the tasks that might be in parallel with each spindle. + MaybeParallelTasks MPTasks; + TI.evaluateParallelState(MPTasks); + + // Map each detach instruction with the sync instructions that could sync it. + for (SyncInst *Sync : Syncs) + for (const Task *MPT : + MPTasks.TaskList[TI.getSpindleFor(Sync->getParent())]) + DetachToSync[MPT->getDetach()].push_back(Sync); + + // Record objects involved in races + for (auto &ObjRD : RI.getObjectMRForRace()) + ObjectMRForRace[ObjRD.first] = ObjRD.second; + + uint64_t LocalId = getLocalFunctionID(F); + IRBuilder<> IRB(getEntryBBInsertPt(F.getEntryBlock())); + Value *FuncId = FunctionFED.localToGlobalId(LocalId, IRB); + + bool Result = false; + if (!EnableStaticRaceDetection) { + SimpleInstrumentor FuncI(*this, TI, LI, DT, TLI); + Result |= FuncI.InstrumentSimpleInstructions(AllLoadsAndStores); + Result |= FuncI.InstrumentSimpleInstructions(AtomicAccesses); + Result |= FuncI.InstrumentAnyMemIntrinsics(MemIntrinCalls); + Result |= FuncI.InstrumentCalls(IntrinsicCalls); + Result |= FuncI.InstrumentCalls(LibCalls); + Result |= FuncI.InstrumentCalls(Callsites); + + // Instrument ancillary instructions including allocas, allocation-function + // calls, free calls, detaches, and syncs. + Result |= FuncI.InstrumentAncillaryInstructions(Allocas, AllocationFnCalls, + FreeCalls, SyncRegNums, + SRCounters, DL); + } else { + Instrumentor FuncI(*this, RI, TI, LI, DT, TLI); + + // Insert MAAP flags for each function argument. + FuncI.InsertArgMAAPs(F, FuncId); + + Result |= FuncI.InstrumentSimpleInstructions(AllLoadsAndStores); + Result |= FuncI.InstrumentSimpleInstructions(AtomicAccesses); + Result |= FuncI.InstrumentAnyMemIntrinsics(MemIntrinCalls); + Result |= FuncI.InstrumentCalls(IntrinsicCalls); + Result |= FuncI.InstrumentCalls(LibCalls); + Result |= FuncI.InstrumentCalls(Callsites); + + // Find detaches that need to be instrumented for loop instructions whose + // instrumentation will be coalesced. 
+    FuncI.GetDetachesForCoalescedInstrumentation(LoopInstToHoist,
+                                                 LoopInstToSink);
+
+    // Instrument ancillary instructions including allocas, allocation-function
+    // calls, free calls, detaches, and syncs.
+    Result |= FuncI.InstrumentAncillaryInstructions(Allocas, AllocationFnCalls,
+                                                    FreeCalls, SyncRegNums,
+                                                    SRCounters, DL);
+
+    // Hoist and sink instrumentation when possible (applies to all loops, not
+    // just Tapir loops). This also inserts MAAP checks for the hoisted and
+    // sunk instrumentation.
+    Result |=
+        FuncI.InstrumentLoops(LoopInstToHoist, LoopInstToSink, TapirLoops, &SE);
+
+    // Once we have handled ancillary instructions, we've done the necessary
+    // analysis on this function. We now perform delayed instrumentation, which
+    // can involve changing the CFG and thereby violating some analyses.
+    Result |= FuncI.PerformDelayedInstrumentation();
+  }
+
+  if (Result) {
+    bool MaySpawn = !TI.isSerial();
+    if (InstrumentationSet & SERIESPARALLEL) {
+      IRBuilder<> IRB(cast<Instruction>(FuncId)->getNextNode());
+      CsiFuncProperty FuncEntryProp;
+      FuncEntryProp.setMaySpawn(MaySpawn);
+      if (MaySpawn)
+        FuncEntryProp.setNumSyncReg(SRCounters[TI.getRootTask()->getEntry()]);
+      // TODO: Determine if we actually want the frame pointer, not the stack
+      // pointer.
+      Value *FrameAddr =
+          IRB.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::frameaddress,
+                                                   IRB.getInt8PtrTy()),
+                         {IRB.getInt32(0)});
+      Value *StackSave =
+          IRB.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::stacksave));
+      CallInst *EntryCall =
+          IRB.CreateCall(CsanFuncEntry, {FuncId, FrameAddr, StackSave,
+                                         FuncEntryProp.getValue(IRB)});
+      setInstrumentationDebugLoc(F, EntryCall);
+    } else {
+      // Search for a call to CsanFuncEntry, and update its ID argument.
+      for (BasicBlock::iterator I = cast<Instruction>(FuncId)->getIterator(),
+                                E = F.getEntryBlock().end();
+           I != E; ++I) {
+        if (CallBase *CB = dyn_cast<CallBase>(&*I))
+          if (CB->getCalledFunction() == CsanFuncEntry.getCallee()) {
+            CB->setArgOperand(0, FuncId);
+            break;
+          }
+      }
+    }
+
+    EscapeEnumerator EE(F, "csan_cleanup", false);
+    while (IRBuilder<> *AtExit = EE.Next()) {
+      if (InstrumentationSet & SERIESPARALLEL) {
+        uint64_t ExitLocalId = FunctionExitFED.add(*AtExit->GetInsertPoint());
+        Value *ExitCsiId =
+            FunctionExitFED.localToGlobalId(ExitLocalId, *AtExit);
+        CsiFuncExitProperty FuncExitProp;
+        FuncExitProp.setMaySpawn(MaySpawn);
+        FuncExitProp.setEHReturn(isa<ResumeInst>(AtExit->GetInsertPoint()));
+        CallInst *ExitCall = AtExit->CreateCall(
+            CsanFuncExit, {ExitCsiId, FuncId, FuncExitProp.getValue(*AtExit)});
+        setInstrumentationDebugLoc(F, ExitCall);
+      } else {
+        // Search for a call to CsanFuncExit, and update its ID argument.
+        for (BasicBlock::iterator I = AtExit->GetInsertBlock()->begin(),
+                                  E = AtExit->GetInsertBlock()->end();
+             I != E; ++I) {
+          if (CallBase *CB = dyn_cast<CallBase>(&*I))
+            if (CB->getCalledFunction() == CsanFuncExit.getCallee()) {
+              CB->setArgOperand(1, FuncId);
+              break;
+            }
+        }
+      }
+    }
+  }
+
+  // Record aggregate race information for the function and its arguments for
+  // interprocedural analysis.
+ // + // TODO: Clean this up + RaceInfo::RaceType FuncRT = RaceInfo::None; + for (Instruction *I : AllLoadsAndStores) + FuncRT = RaceInfo::unionRaceTypes(FuncRT, RI.getRaceType(I)); + for (Instruction *I : AtomicAccesses) + FuncRT = RaceInfo::unionRaceTypes(FuncRT, RI.getRaceType(I)); + for (Instruction *I : MemIntrinCalls) + FuncRT = RaceInfo::unionRaceTypes(FuncRT, RI.getRaceType(I)); + for (Instruction *I : Callsites) { + if (const CallBase *CB = dyn_cast(I)) { + // Use updated information about the race type of the call, if it's + // available. + const Function *CF = CB->getCalledFunction(); + if (FunctionRaceType.count(CF)) { + FuncRT = RaceInfo::unionRaceTypes(FuncRT, FunctionRaceType[CF]); + // Preserve the local-race marking if the callsite itself is involved in + // a local race. + if (RaceInfo::isLocalRace(RI.getRaceType(I))) + FuncRT = RaceInfo::unionRaceTypes(FuncRT, RaceInfo::Local); + continue; + } + } + FuncRT = RaceInfo::unionRaceTypes(FuncRT, RI.getRaceType(I)); + } + FunctionRaceType[&F] = FuncRT; + + return Result; +} + +bool CilkSanitizerImpl::instrumentLoadOrStore(Instruction *I, + IRBuilder<> &IRB) { + bool IsWrite = isa(*I); + Value *Addr = IsWrite + ? cast(I)->getPointerOperand() + : cast(I)->getPointerOperand(); + Type *Ty = + IsWrite ? cast(I)->getValueOperand()->getType() : I->getType(); + + // swifterror memory addresses are mem2reg promoted by instruction selection. + // As such they cannot have regular uses like an instrumentation function and + // it makes no sense to track them as memory. + if (Addr->isSwiftError()) + return false; + + int NumBytesAccessed = getNumBytesAccessed(Ty, DL); + if (-1 == NumBytesAccessed) { + // Ignore accesses with bad sizes. + NumAccessesWithBadSize++; + return false; + } + + // Only insert instrumentation if requested + if (!(InstrumentationSet & SHADOWMEMORY)) + return true; + + const Align Alignment = IsWrite + ? 
cast(I)->getAlign() + : cast(I)->getAlign(); + CsiLoadStoreProperty Prop; + Prop.setAlignment(Alignment); + Prop.setIsAtomic(I->isAtomic()); + Prop.setIsThreadLocal(isThreadLocalObject(lookupUnderlyingObject(Addr))); + if (IsWrite) { + // Instrument store + uint64_t LocalId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, lookupUnderlyingObject(Addr)); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.getInt32(NumBytesAccessed), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanWrite, Args); + IRB.SetInstDebugLocation(Call); + NumInstrumentedWrites++; + } else { + // Instrument load + uint64_t LocalId = LoadFED.add(*I); + uint64_t LoadObjId = LoadObj.add(*I, lookupUnderlyingObject(Addr)); + assert(LocalId == LoadObjId && + "Load received different ID's in FED and object tables."); + Value *CsiId = LoadFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.getInt32(NumBytesAccessed), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanRead, Args); + IRB.SetInstDebugLocation(Call); + NumInstrumentedReads++; + } + return true; +} + +bool CilkSanitizerImpl::instrumentAtomic(Instruction *I, IRBuilder<> &IRB) { + Value *Addr; + Type *Ty; + Align Alignment; + if (AtomicRMWInst *RMWI = dyn_cast(I)) { + Addr = RMWI->getPointerOperand(); + Ty = RMWI->getValOperand()->getType(); + Alignment = RMWI->getAlign(); + } else if (AtomicCmpXchgInst *CASI = dyn_cast(I)) { + Addr = CASI->getPointerOperand(); + Ty = CASI->getNewValOperand()->getType(); + Alignment = CASI->getAlign(); + } else { + return false; + } + + int NumBytesAccessed = getNumBytesAccessed(Ty, DL); + if (-1 == NumBytesAccessed) { + // Ignore accesses with bad sizes. + NumAccessesWithBadSize++; + return false; + } + + // Only insert instrumentation if requested + if (!(InstrumentationSet & SHADOWMEMORY)) + return true; + + CsiLoadStoreProperty Prop; + Prop.setAlignment(Alignment); + Prop.setIsAtomic(true); + Prop.setIsThreadLocal(isThreadLocalObject(lookupUnderlyingObject(Addr))); + uint64_t LocalId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, lookupUnderlyingObject(Addr)); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.getInt32(NumBytesAccessed), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanWrite, Args); + IRB.SetInstDebugLocation(Call); + NumInstrumentedWrites++; + return true; +} + +FunctionCallee CilkSanitizerImpl::getOrInsertSynthesizedHook(StringRef Name, + FunctionType *T, + AttributeList AL) { + // If no bitcode file has been linked, then we cannot check if it contains a + // particular library hook. Simply return the hook. If the Cilksan library + // doesn't contain that hook, the linker will raise an error. + if (!LinkedBitcode) + return getHookFunction(Name, T, AL); + + // Check if the linked bitcode file contains the library hook. If it does, + // return that hook. + if (FunctionsInBitcode.contains(std::string(Name))) + return getHookFunction(Name, T, AL); + + // We did not find the library hook in the linked bitcode file. Synthesize a + // default version of the hook that simply calls __csan_default_libhook. 
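The fallback hook synthesized in the next few lines has, at the source level, roughly this shape: its leading parameters are the call ID, the callee's function ID, and the MAAP count, and its body simply forwards those three values to `__csan_default_libhook`. The hook name `__csan_foo` and its trailing parameters below are hypothetical stand-ins for whichever library function lacked a dedicated hook, and the printing body given to `__csan_default_libhook` stands in for the real runtime entry point.

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for the generic fallback hook provided by the Cilksan runtime.
extern "C" void __csan_default_libhook(uint64_t call_id, uint64_t func_id,
                                       uint8_t maap_count) {
  std::printf("libcall %llu -> callee %llu (%u MAAPs)\n",
              (unsigned long long)call_id, (unsigned long long)func_id,
              (unsigned)maap_count);
}

// Hypothetical synthesized hook for a library call `foo(const char *)` that
// has no dedicated hook in the linked tool bitcode: it forwards its first
// three operands and ignores the property word and original arguments.
extern "C" void __csan_foo(uint64_t call_id, uint64_t func_id,
                           uint8_t maap_count, uint64_t /*prop*/,
                           const char * /*arg*/) {
  __csan_default_libhook(call_id, func_id, maap_count);
}

int main() {
  __csan_foo(7, 3, 0, 0, "hello");
  return 0;
}
```

Synthesizing a forwarding body this way keeps the final link from failing when the tool bitcode provides only the generic callback.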
+ FunctionCallee NewHook = M.getOrInsertFunction(Name, T, AL); + Function *NewHookFn = cast(NewHook.getCallee()); + NewHookFn->setOnlyAccessesInaccessibleMemOrArgMem(); + NewHookFn->setDoesNotThrow(); + BasicBlock *Entry = BasicBlock::Create(M.getContext(), "entry", NewHookFn); + IRBuilder<> IRB(ReturnInst::Create(M.getContext(), Entry)); + + // Insert a call to the default library function hook + Type *IDType = IRB.getInt64Ty(); + FunctionType *DefaultHookTy = + FunctionType::get(IRB.getVoidTy(), + {/*call_id*/ + IDType, /*func_id*/ IDType, + /*MAAP_count*/ IRB.getInt8Ty()}, + /*isVarArg*/ false); + FunctionCallee DefaultHook = + M.getOrInsertFunction("__csan_default_libhook", DefaultHookTy); + IRB.CreateCall(DefaultHook, {NewHookFn->getArg(0), NewHookFn->getArg(1), + NewHookFn->getArg(2)}); + return NewHook; +} + +// Check if we need to spill a value of this type onto the stack to pass it to a +// hook. +static bool NeedToSpillType(const Type *T) { + return T->isVectorTy() || T->isStructTy(); +} + +bool CilkSanitizerImpl::instrumentIntrinsicCall( + Instruction *I, SmallVectorImpl *MAAPVals) { + assert(!callsPlaceholderFunction(*I) && + "instrumentIntrinsicCall called on placeholder function"); + + // Only insert instrumentation if requested + if (!(InstrumentationSet & SERIESPARALLEL)) + return true; + + CallBase *CB = dyn_cast(I); + if (!CB) + return false; + Function *Called = CB->getCalledFunction(); + + IRBuilder<> IRB(I); + LLVMContext &Ctx = IRB.getContext(); + uint64_t LocalId = CallsiteFED.add(*I); + Value *CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + Value *FuncId = GetCalleeFuncID(Called, IRB); + assert(FuncId != NULL); + + Value *NumMVVal = IRB.getInt8(0); + if (MAAPVals && !MAAPVals->empty()) { + unsigned NumMV = MAAPVals->size(); + NumMVVal = IRB.getInt8(NumMV); + } + + CsiCallProperty Prop; + // TODO: Set appropriate property values for this intrinsic call + Value *PropVal = Prop.getValue(IRB); + + // Since C/C++ does not like '.' characters in function names, convert '.' to + // '_' in the hook name. + SmallString<256> Buf; + for (char C : Called->getName().bytes()) { + if ('.' == C) + Buf.push_back('_'); + else + Buf.push_back(C); + } + Type *IDType = IRB.getInt64Ty(); + + // If the intrinsic does not return, insert the hook before the intrinsic. + if (CB->doesNotReturn()) { + // Synthesize the before hook for this function. + SmallVector BeforeHookParamTys( + {IDType, /*callee func_id*/ IDType, + /*Num MAAPVal*/ IRB.getInt8Ty(), CsiCallProperty::getType(Ctx)}); + SmallVector BeforeHookParamVals( + {CallsiteId, FuncId, NumMVVal, PropVal}); + + // Populate the BeforeHook parameters with the parameters of the + // instrumented function itself. + Value *SavedStack = nullptr; + const DataLayout &DL = M.getDataLayout(); + for (Value *Arg : CB->args()) { + Type *ArgTy = Arg->getType(); + if (!NeedToSpillType(ArgTy)) { + // We can simply pass the argument directly to the hook. + BeforeHookParamTys.push_back(ArgTy); + BeforeHookParamVals.push_back(Arg); + continue; + } + // We need to spill the argument onto the stack. 
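The spill performed next follows a small, self-contained pattern: save the stack pointer, materialize the aggregate in a fresh alloca, hand the alloca to the hook, and restore the stack afterwards. Pulled out of context it looks roughly like the sketch below, which assumes an `IRBuilder` already positioned at the call site and a hook that accepts a pointer operand; this helper does not exist in the pass and is shown only to isolate the pattern.

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Sketch only: pass one aggregate (vector/struct) value to a hook by
// spilling it into stack memory around the hook call.
static void passAggregateViaStackSpill(Module &M, IRBuilder<> &IRB,
                                       FunctionCallee Hook, Value *Agg) {
  // Remember the current stack pointer so the temporary can be reclaimed.
  Value *SavedSP =
      IRB.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::stacksave));
  // Spill the aggregate into a fresh stack slot.
  AllocaInst *Spill = IRB.CreateAlloca(Agg->getType());
  IRB.CreateStore(Agg, Spill);
  // The hook sees a pointer to the spilled value rather than the value.
  IRB.CreateCall(Hook, {Spill});
  // Pop the temporary stack allocation.
  IRB.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::stackrestore),
                 {SavedSP});
}
```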
+ + // Save the stack pointer, if we haven't already + if (!SavedStack) + SavedStack = + IRB.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + + // Spill the argument onto the stack + AllocaInst *ArgSpill = IRB.CreateAlloca(ArgTy); + IRB.CreateAlignedStore(Arg, ArgSpill, DL.getStackAlignment()); + + // Add the spilled argument + BeforeHookParamTys.push_back(ArgSpill->getType()); + BeforeHookParamVals.push_back(ArgSpill); + } + FunctionType *BeforeHookTy = FunctionType::get( + IRB.getVoidTy(), BeforeHookParamTys, Called->isVarArg()); + FunctionCallee BeforeIntrinCallHook = getOrInsertSynthesizedHook( + ("__csan_" + Buf).str(), BeforeHookTy); + + // Insert the hook before the call + insertHookCall(I, BeforeIntrinCallHook, BeforeHookParamVals); + + // If we previously saved the stack pointer, restore it + if (SavedStack) + IRB.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::stackrestore), + {SavedStack}); + return true; + } + + // Otherwise, insert the hook after the intrinsic. + assert(!isa(I) && + "instrumentIntrinsicCall called on invoke instruction"); + + BasicBlock::iterator Iter(I); + Iter++; + IRB.SetInsertPoint(&*Iter); + + // Synthesize the after hook for this function. + SmallVector AfterHookParamTys({IDType, /*callee func_id*/ IDType, + /*Num MAAPVal*/ IRB.getInt8Ty(), + CsiCallProperty::getType(Ctx)}); + SmallVector AfterHookParamVals( + {CallsiteId, FuncId, NumMVVal, PropVal}); + + // Populate the AfterHook parameters with the parameters of the instrumented + // function itself. + Value *SavedStack = nullptr; + const DataLayout &DL = M.getDataLayout(); + if (!Called->getReturnType()->isVoidTy()) { + Type *RetTy = Called->getReturnType(); + if (!NeedToSpillType(RetTy)) { + // We can simply pass the return value directly to the hook. + AfterHookParamTys.push_back(RetTy); + AfterHookParamVals.push_back(CB); + } else { + // We need to spill the return value onto the stack. + + // Save the stack pointer, if we haven't already + if (!SavedStack) + SavedStack = + IRB.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + + // Spill the return value onto the stack + AllocaInst *RetSpill = IRB.CreateAlloca(RetTy); + IRB.CreateAlignedStore(CB, RetSpill, DL.getStackAlignment()); + + // Add the spilled return value + AfterHookParamTys.push_back(RetSpill->getType()); + AfterHookParamVals.push_back(RetSpill); + } + } + for (Value *Arg : CB->args()) { + Type *ArgTy = Arg->getType(); + if (!NeedToSpillType(ArgTy)) { + // We can simply pass the argument directly to the hook. + AfterHookParamTys.push_back(ArgTy); + AfterHookParamVals.push_back(Arg); + continue; + } + // We need to spill the argument onto the stack. 
+ + // Save the stack pointer, if we haven't already + if (!SavedStack) + SavedStack = + IRB.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + + // Spill the argument onto the stack + AllocaInst *ArgSpill = IRB.CreateAlloca(ArgTy); + IRB.CreateAlignedStore(Arg, ArgSpill, DL.getStackAlignment()); + + // Add the spilled argument + AfterHookParamTys.push_back(ArgSpill->getType()); + AfterHookParamVals.push_back(ArgSpill); + } + + FunctionType *AfterHookTy = + FunctionType::get(IRB.getVoidTy(), AfterHookParamTys, Called->isVarArg()); + FunctionCallee AfterIntrinCallHook = + getOrInsertSynthesizedHook(("__csan_" + Buf).str(), AfterHookTy); + + // Insert the hook call + insertHookCall(&*Iter, AfterIntrinCallHook, AfterHookParamVals); + + if (SavedStack) { + IRB.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::stackrestore), + {SavedStack}); + } + return true; +} + +bool CilkSanitizerImpl::instrumentLibCall(Instruction *I, + SmallVectorImpl *MAAPVals) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SERIESPARALLEL)) + return true; + + bool IsInvoke = isa(I); + CallBase *CB = dyn_cast(I); + if (!CB) + return false; + Function *Called = CB->getCalledFunction(); + + IRBuilder<> IRB(I); + LLVMContext &Ctx = IRB.getContext(); + uint64_t LocalId = CallsiteFED.add(*I); + Value *DefaultID = getDefaultID(IRB); + Value *CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + Value *FuncId = GetCalleeFuncID(Called, IRB); + assert(FuncId != NULL); + + Value *NumMVVal = IRB.getInt8(0); + if (MAAPVals && !MAAPVals->empty()) { + unsigned NumMV = MAAPVals->size(); + NumMVVal = IRB.getInt8(NumMV); + } + + CsiCallProperty Prop; + Value *DefaultPropVal = Prop.getValue(IRB); + // TODO: Set appropriate property values for this intrinsic call + Value *PropVal = Prop.getValue(IRB); + Type *IDType = IRB.getInt64Ty(); + + // If the intrinsic does not return, insert the hook before the intrinsic. + if (CB->doesNotReturn()) { + // Synthesize the before hook for this function. + SmallVector BeforeHookParamTys( + {IDType, /*callee func_id*/ IDType, + /*MAAP_count*/ IRB.getInt8Ty(), CsiCallProperty::getType(Ctx)}); + SmallVector BeforeHookParamVals( + {CallsiteId, FuncId, NumMVVal, PropVal}); + BeforeHookParamTys.append(Called->getFunctionType()->param_begin(), + Called->getFunctionType()->param_end()); + BeforeHookParamVals.append(CB->arg_begin(), CB->arg_end()); + FunctionType *BeforeHookTy = FunctionType::get( + IRB.getVoidTy(), BeforeHookParamTys, Called->isVarArg()); + FunctionCallee BeforeLibCallHook = getOrInsertSynthesizedHook( + ("__csan_" + Called->getName()).str(), BeforeHookTy); + + insertHookCall(I, BeforeLibCallHook, BeforeHookParamVals); + return true; + } + + // Otherwise, insert the hook after the intrinsic. + + // Synthesize the after hook for this function. 
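The after hook assembled next orders its parameters as bookkeeping values first (call ID, callee function ID, MAAP count, property word), then the call's return value, then the original arguments. For a hypothetical library routine `double bar(int, double)`, the synthesized hook would therefore be declared roughly as follows; the name and the use of a plain integer for the property word are assumptions made for illustration.

```cpp
#include <cstdint>

// Hypothetical after hook for `double bar(int, double)`: bookkeeping
// parameters first, then the return value, then the original arguments.
extern "C" void __csan_bar(uint64_t call_id, uint64_t callee_func_id,
                           uint8_t maap_count, uint64_t prop,
                           double return_value, int arg0, double arg1);

int main() { return 0; }
```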
+ SmallVector AfterHookParamTys( + {IDType, /*callee func_id*/ IDType, + /*Num MAAPVal*/ IRB.getInt8Ty(), CsiCallProperty::getType(Ctx)}); + SmallVector AfterHookParamVals( + {CallsiteId, FuncId, NumMVVal, PropVal}); + SmallVector AfterHookDefaultVals( + {DefaultID, DefaultID, IRB.getInt8(0), DefaultPropVal}); + if (!Called->getReturnType()->isVoidTy()) { + AfterHookParamTys.push_back(Called->getReturnType()); + AfterHookParamVals.push_back(CB); + AfterHookDefaultVals.push_back( + Constant::getNullValue(Called->getReturnType())); + } + AfterHookParamTys.append(Called->getFunctionType()->param_begin(), + Called->getFunctionType()->param_end()); + AfterHookParamVals.append(CB->arg_begin(), CB->arg_end()); + for (Value *Arg : CB->args()) + AfterHookDefaultVals.push_back(Constant::getNullValue(Arg->getType())); + FunctionType *AfterHookTy = + FunctionType::get(IRB.getVoidTy(), AfterHookParamTys, Called->isVarArg()); + FunctionCallee AfterLibCallHook = getOrInsertSynthesizedHook( + ("__csan_" + Called->getName()).str(), AfterHookTy); + + BasicBlock::iterator Iter(I); + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block and the + // exception block. + InvokeInst *II = cast(I); + insertHookCallInSuccessorBB( + II->getNormalDest(), II->getParent(), AfterLibCallHook, + AfterHookParamVals, AfterHookDefaultVals); + // Don't insert any instrumentation in the exception block. + } else { + // Simple call instruction; there is only one "after" position. + Iter++; + IRB.SetInsertPoint(&*Iter); + insertHookCall(&*Iter, AfterLibCallHook, AfterHookParamVals); + } + + return true; +} + +bool CilkSanitizerImpl::instrumentCallsite(Instruction *I, + SmallVectorImpl *MAAPVals) { + if (callsPlaceholderFunction(*I)) + return false; + + bool IsInvoke = isa(I); + CallBase *CB = dyn_cast(I); + if (!CB) + return false; + Function *Called = CB->getCalledFunction(); + + // Only insert instrumentation if requested + if (!(InstrumentationSet & SERIESPARALLEL)) + return true; + + IRBuilder<> IRB(I); + uint64_t LocalId = CallsiteFED.add(*I); + Value *DefaultID = getDefaultID(IRB); + Value *CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + Value *FuncId = GetCalleeFuncID(Called, IRB); + assert(FuncId != NULL); + + Value *NumMVVal = IRB.getInt8(0); + if (MAAPVals && !MAAPVals->empty()) { + unsigned NumMV = MAAPVals->size(); + NumMVVal = IRB.getInt8(NumMV); + } + + CsiCallProperty Prop; + Value *DefaultPropVal = Prop.getValue(IRB); + Prop.setIsIndirect(!Called); + Value *PropVal = Prop.getValue(IRB); + insertHookCall(I, CsanBeforeCallsite, {CallsiteId, FuncId, NumMVVal, + PropVal}); + + BasicBlock::iterator Iter(I); + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block and the + // exception block. + InvokeInst *II = cast(I); + if (!CB->doesNotReturn()) { + // If this function can return normally, insert an after_call hook at the + // normal destination. 
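For ordinary call sites the generic hooks simply bracket the call; for invokes, as the surrounding code shows, the after hook is duplicated onto the normal and unwind destinations. In the common straight-line case the run-time effect is the following (stand-in hook names and simplified signatures):

```cpp
#include <cstdint>
#include <cmath>

// Stand-ins for the generic call-site hooks (signatures simplified).
extern "C" void stub_before_call(uint64_t call_id, uint64_t func_id,
                                 uint8_t maap_count, uint64_t prop) {}
extern "C" void stub_after_call(uint64_t call_id, uint64_t func_id,
                                uint8_t maap_count, uint64_t prop) {}

double instrumented_sqrt(double x) {
  stub_before_call(/*call_id*/ 4, /*func_id*/ 9, /*MAAPs*/ 0, /*prop*/ 0);
  double r = std::sqrt(x);   // the original call site
  stub_after_call(4, 9, 0, 0);
  return r;
}

int main() { return instrumented_sqrt(9.0) == 3.0 ? 0 : 1; }
```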
+ insertHookCallInSuccessorBB( + II->getNormalDest(), II->getParent(), CsanAfterCallsite, + {CallsiteId, FuncId, NumMVVal, PropVal}, + {DefaultID, DefaultID, IRB.getInt8(0), DefaultPropVal}); + } + CsiCallProperty Prop; + Prop.setIsIndirect(!Called); + Prop.setIsUnwind(); + Value *PropVal = Prop.getValue(IRB); + insertHookCallInSuccessorBB( + II->getUnwindDest(), II->getParent(), CsanAfterCallsite, + {CallsiteId, FuncId, NumMVVal, PropVal}, + {DefaultID, DefaultID, IRB.getInt8(0), DefaultPropVal}); + } else if (!CB->doesNotReturn()) { + // If this function can return normally, insert an after_call hook at the + // normal destination. + + // Simple call instruction; there is only one "after" position. + Iter++; + IRB.SetInsertPoint(&*Iter); + PropVal = Prop.getValue(IRB); + insertHookCall(&*Iter, CsanAfterCallsite, + {CallsiteId, FuncId, NumMVVal, PropVal}); + } + + return true; +} + +bool CilkSanitizerImpl::suppressCallsite(Instruction *I) { + if (callsPlaceholderFunction(*I)) + return false; + + bool IsInvoke = isa(I); + + IRBuilder<> IRB(I); + insertHookCall(I, CsanDisableChecking, {}); + + BasicBlock::iterator Iter(I); + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block and the + // exception block. + InvokeInst *II = cast(I); + insertHookCallInSuccessorBB( + II->getNormalDest(), II->getParent(), CsanEnableChecking, {}, {}); + insertHookCallInSuccessorBB( + II->getUnwindDest(), II->getParent(), CsanEnableChecking, {}, {}); + } else { + // Simple call instruction; there is only one "after" position. + Iter++; + IRB.SetInsertPoint(&*Iter); + insertHookCall(&*Iter, CsanEnableChecking, {}); + } + + return true; +} + +static bool IsMemTransferDstOperand(unsigned OperandNum) { + // This check should be kept in sync with TapirRaceDetect::GetGeneralAccesses. + return (OperandNum == 0); +} + +static bool IsMemTransferSrcOperand(unsigned OperandNum) { + // This check should be kept in sync with TapirRaceDetect::GetGeneralAccesses. + return (OperandNum == 1); +} + +bool CilkSanitizerImpl::instrumentAnyMemIntrinAcc(Instruction *I, + unsigned OperandNum, + IRBuilder<> &IRB) { + CsiLoadStoreProperty Prop; + if (AnyMemTransferInst *M = dyn_cast(I)) { + // Only instrument the large load and the large store components as + // necessary. 
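A memory transfer such as `memcpy` is modeled as one large store of the destination plus one large load of the source, each tagged with the transfer length rather than a fixed access size, mirroring the operand convention noted above for TapirRaceDetect. In stand-in form:

```cpp
#include <cstddef>
#include <cstring>
#include <cstdint>

// Stand-ins for the large-access hooks (simplified signatures).
extern "C" void stub_large_read(uint64_t id, const void *addr, std::size_t len) {}
extern "C" void stub_large_write(uint64_t id, void *addr, std::size_t len) {}

void instrumented_memcpy(void *dst, const void *src, std::size_t len) {
  stub_large_write(/*store id*/ 0, dst, len);  // destination: one large store
  stub_large_read(/*load id*/ 1, src, len);    // source: one large load
  std::memcpy(dst, src, len);
}

int main() {
  char a[8] = "abcdefg", b[8] = {};
  instrumented_memcpy(b, a, sizeof a);
  return b[0] == 'a' ? 0 : 1;
}
```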
+ bool Instrumented = false; + + if (IsMemTransferDstOperand(OperandNum)) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SHADOWMEMORY)) + return true; + + Value *Addr = M->getDest(); + Prop.setAlignment(M->getDestAlign()); + Prop.setIsThreadLocal(isThreadLocalObject(lookupUnderlyingObject(Addr))); + // Instrument the store + uint64_t StoreId = StoreFED.add(*I); + + // TODO: Don't recalculate underlying objects + uint64_t StoreObjId = StoreObj.add(*I, lookupUnderlyingObject(Addr)); + assert(StoreId == StoreObjId && + "Store received different ID's in FED and object tables."); + + Value *CsiId = StoreFED.localToGlobalId(StoreId, IRB); + Value *Args[] = {CsiId, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getLength(), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanLargeWrite, Args); + IRB.SetInstDebugLocation(Call); + ++NumInstrumentedMemIntrinsicWrites; + Instrumented = true; + } + + if (IsMemTransferSrcOperand(OperandNum)) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SHADOWMEMORY)) + return true; + + Value *Addr = M->getSource(); + Prop.setAlignment(M->getSourceAlign()); + Prop.setIsThreadLocal(isThreadLocalObject(lookupUnderlyingObject(Addr))); + // Instrument the load + uint64_t LoadId = LoadFED.add(*I); + + // TODO: Don't recalculate underlying objects + uint64_t LoadObjId = LoadObj.add(*I, lookupUnderlyingObject(Addr)); + assert(LoadId == LoadObjId && + "Load received different ID's in FED and object tables."); + + Value *CsiId = LoadFED.localToGlobalId(LoadId, IRB); + Value *Args[] = {CsiId, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getLength(), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanLargeRead, Args); + IRB.SetInstDebugLocation(Call); + ++NumInstrumentedMemIntrinsicReads; + Instrumented = true; + } + return Instrumented; + } else if (AnyMemIntrinsic *M = dyn_cast(I)) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SHADOWMEMORY)) + return true; + + Value *Addr = M->getDest(); + Prop.setAlignment(M->getDestAlign()); + Prop.setIsThreadLocal(isThreadLocalObject(lookupUnderlyingObject(Addr))); + uint64_t LocalId = StoreFED.add(*I); + + // TODO: Don't recalculate underlying objects + uint64_t StoreObjId = StoreObj.add(*I, lookupUnderlyingObject(Addr)); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getLength(), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanLargeWrite, Args); + IRB.SetInstDebugLocation(Call); + ++NumInstrumentedMemIntrinsicWrites; + return true; + } + return false; +} + +static void getTaskExits( + DetachInst *DI, SmallVectorImpl &TaskReturns, + SmallVectorImpl &TaskResumes, + SmallVectorImpl &SharedEHExits, + TaskInfo &TI) { + BasicBlock *DetachedBlock = DI->getDetached(); + Task *T = TI.getTaskFor(DetachedBlock); + BasicBlock *ContinueBlock = DI->getContinue(); + + // Examine the predecessors of the continue block and save any predecessors in + // the task as a task return. + for (BasicBlock *Pred : predecessors(ContinueBlock)) { + if (T->simplyEncloses(Pred)) { + assert(isa(Pred->getTerminator())); + TaskReturns.push_back(Pred); + } + } + + // If the detach cannot throw, we're done. 
+ if (!DI->hasUnwindDest()) + return; + + // Detached-rethrow exits can appear in strange places within a task-exiting + // spindle. Hence we loop over all blocks in the spindle to find + // detached rethrows. + for (Spindle *S : depth_first>(T->getEntrySpindle())) { + if (S->isSharedEH()) { + if (llvm::any_of(predecessors(S), + [](const Spindle *Pred){ return !Pred->isSharedEH(); })) + SharedEHExits.push_back(S); + continue; + } + + for (BasicBlock *B : S->blocks()) + if (isDetachedRethrow(B->getTerminator())) + TaskResumes.push_back(B); + } +} + +bool CilkSanitizerImpl::instrumentDetach(DetachInst *DI, unsigned SyncRegNum, + unsigned NumSyncRegs, + DominatorTree &DT, TaskInfo &TI, + LoopInfo &LI) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SERIESPARALLEL)) + return true; + + LLVMContext &Ctx = DI->getContext(); + BasicBlock *TaskEntryBlock = TI.getTaskFor(DI->getParent())->getEntry(); + IRBuilder<> IDBuilder(getEntryBBInsertPt(*TaskEntryBlock)); + bool TapirLoopBody = spawnsTapirLoopBody(DI, LI, TI); + ConstantInt *SyncRegVal = ConstantInt::get(Type::getInt32Ty(Ctx), SyncRegNum); + ConstantInt *DefaultSyncRegVal = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + CsiDetachProperty DetachProp; + DetachProp.setForTapirLoopBody(TapirLoopBody); + // Instrument the detach instruction itself + Value *DetachID; + { + IRBuilder<> IRB(DI); + uint64_t LocalID = DetachFED.add(*DI); + DetachID = DetachFED.localToGlobalId(LocalID, IDBuilder); + Instruction *Call = IRB.CreateCall( + CsanDetach, {DetachID, SyncRegVal, DetachProp.getValue(IRB)}); + IRB.SetInstDebugLocation(Call); + } + NumInstrumentedDetaches++; + + // Find the detached block, continuation, and associated reattaches. + BasicBlock *DetachedBlock = DI->getDetached(); + BasicBlock *ContinueBlock = DI->getContinue(); + Task *T = TI.getTaskFor(DetachedBlock); + SmallVector TaskExits, TaskResumes; + SmallVector SharedEHExits; + getTaskExits(DI, TaskExits, TaskResumes, SharedEHExits, TI); + + // Instrument the entry and exit points of the detached task. + { + // Instrument the entry point of the detached task. + IRBuilder<> IRB(&*getFirstInsertionPtInDetachedBlock(DetachedBlock)); + uint64_t LocalID = TaskFED.add(*DetachedBlock); + Value *TaskID = TaskFED.localToGlobalId(LocalID, IDBuilder); + CsiTaskProperty Prop; + Prop.setIsTapirLoopBody(TapirLoopBody); + Prop.setNumSyncReg(NumSyncRegs); + // Get the frame and stack pointers. + Value *FrameAddr = IRB.CreateCall( + Intrinsic::getDeclaration(&M, Intrinsic::task_frameaddress), + {IRB.getInt32(0)}); + Value *StackSave = IRB.CreateCall( + Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + Instruction *Call = IRB.CreateCall(CsanTaskEntry, + {TaskID, DetachID, FrameAddr, + StackSave, Prop.getValue(IRB)}); + IRB.SetInstDebugLocation(Call); + + // Instrument the exit points of the detached tasks. + for (BasicBlock *TaskExit : TaskExits) { + IRBuilder<> IRB(TaskExit->getTerminator()); + uint64_t LocalID = TaskExitFED.add(*TaskExit->getTerminator()); + Value *TaskExitID = TaskExitFED.localToGlobalId(LocalID, IDBuilder); + CsiTaskExitProperty ExitProp; + ExitProp.setIsTapirLoopBody(TapirLoopBody); + Instruction *Call = + IRB.CreateCall(CsanTaskExit, {TaskExitID, TaskID, DetachID, + SyncRegVal, ExitProp.getValue(IRB)}); + IRB.SetInstDebugLocation(Call); + NumInstrumentedDetachExits++; + } + // Instrument the EH exits of the detached task. 
+ for (BasicBlock *TaskExit : TaskResumes) { + IRBuilder<> IRB(TaskExit->getTerminator()); + uint64_t LocalID = TaskExitFED.add(*TaskExit->getTerminator()); + Value *TaskExitID = TaskExitFED.localToGlobalId(LocalID, IDBuilder); + CsiTaskExitProperty ExitProp; + ExitProp.setIsTapirLoopBody(TapirLoopBody); + Instruction *Call = + IRB.CreateCall(CsanTaskExit, {TaskExitID, TaskID, DetachID, + SyncRegVal, ExitProp.getValue(IRB)}); + IRB.SetInstDebugLocation(Call); + NumInstrumentedDetachExits++; + } + + Value *DefaultID = getDefaultID(IDBuilder); + for (Spindle *SharedEH : SharedEHExits) { + // Skip shared-eh spindle exits that are placeholder unreachable blocks. + if (isa( + SharedEH->getEntry()->getFirstNonPHIOrDbgOrLifetime())) + continue; + CsiTaskExitProperty ExitProp; + ExitProp.setIsTapirLoopBody(TapirLoopBody); + insertHookCallAtSharedEHSpindleExits( + SharedEH, T, CsanTaskExit, TaskExitFED, + {TaskID, DetachID, SyncRegVal, ExitProp.getValueImpl(Ctx)}, + {DefaultID, DefaultID, DefaultSyncRegVal, + CsiTaskExitProperty::getDefaultValueImpl(Ctx)}); + } + } + + // Instrument the continuation of the detach. + { + if (isCriticalContinueEdge(DI, 1)) + ContinueBlock = SplitCriticalEdge( + DI, 1, + CriticalEdgeSplittingOptions(&DT, &LI).setSplitDetachContinue()); + + IRBuilder<> IRB(&*ContinueBlock->getFirstInsertionPt()); + uint64_t LocalID = DetachContinueFED.add(*ContinueBlock); + Value *ContinueID = DetachContinueFED.localToGlobalId(LocalID, IDBuilder); + CsiDetachContinueProperty ContProp; + ContProp.setForTapirLoopBody(TapirLoopBody); + Instruction *Call = + IRB.CreateCall(CsanDetachContinue, {ContinueID, DetachID, SyncRegVal, + ContProp.getValue(IRB)}); + IRB.SetInstDebugLocation(Call); + } + // Instrument the unwind of the detach, if it exists. + if (DI->hasUnwindDest()) { + BasicBlock *UnwindBlock = DI->getUnwindDest(); + BasicBlock *PredBlock = DI->getParent(); + if (Value *TF = T->getTaskFrameUsed()) { + // If the detached task uses a taskframe, then we want to insert the + // detach_continue instrumentation for the unwind destination after the + // taskframe.resume. + UnwindBlock = getTaskFrameResumeDest(TF); + assert(UnwindBlock && + "Detach with unwind uses a taskframe with no resume"); + PredBlock = getTaskFrameResume(TF)->getParent(); + } + Value *DefaultID = getDefaultID(IDBuilder); + uint64_t LocalID = DetachContinueFED.add(*UnwindBlock); + Value *ContinueID = DetachContinueFED.localToGlobalId(LocalID, IDBuilder); + CsiDetachContinueProperty ContProp; + Value *DefaultPropVal = ContProp.getValueImpl(Ctx); + ContProp.setIsUnwind(); + ContProp.setForTapirLoopBody(TapirLoopBody); + insertHookCallInSuccessorBB( + UnwindBlock, PredBlock, CsanDetachContinue, + {ContinueID, DetachID, SyncRegVal, ContProp.getValue(Ctx)}, + {DefaultID, DefaultID, DefaultSyncRegVal, DefaultPropVal}); + for (BasicBlock *DRPred : predecessors(UnwindBlock)) + if (isDetachedRethrow(DRPred->getTerminator(), DI->getSyncRegion())) + insertHookCallInSuccessorBB( + UnwindBlock, DRPred, CsanDetachContinue, + {ContinueID, DetachID, SyncRegVal, ContProp.getValue(Ctx)}, + {DefaultID, DefaultID, DefaultSyncRegVal, DefaultPropVal}); + } + return true; +} + +bool CilkSanitizerImpl::instrumentSync(SyncInst *SI, unsigned SyncRegNum) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SERIESPARALLEL)) + return true; + + IRBuilder<> IRB(SI); + // Get the ID of this sync. 
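Because Cilksan executes the instrumented program serially, the detach, task, continuation, and sync hooks inserted by these routines fire in a fixed order for each spawn. The stand-in program below mimics that order for one spawned call followed by a sync; the names are illustrative, and the real hooks additionally carry CSI IDs, sync-region numbers, frame pointers, and property words.

```cpp
#include <cstdio>

// Stand-in hooks, in the order Cilksan observes them for one spawn + sync
// when the program runs serially.
static void stub_detach()          { std::puts("detach"); }
static void stub_task_entry()      { std::puts("  task_entry"); }
static void stub_task_exit()       { std::puts("  task_exit"); }
static void stub_detach_continue() { std::puts("detach_continue"); }
static void stub_sync()            { std::puts("sync"); }

static int spawned_work(int x) { return x + 1; }

int main() {
  stub_detach();            // before the detach
  stub_task_entry();        // first thing in the detached task
  int r = spawned_work(41); // body of the spawned task
  stub_task_exit();         // at each task exit (reattach)
  stub_detach_continue();   // start of the continuation
  stub_sync();              // immediately before the sync
  return r == 42 ? 0 : 1;
}
```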
+ uint64_t LocalID = SyncFED.add(*SI); + Value *SyncID = SyncFED.localToGlobalId(LocalID, IRB); + // Insert instrumentation before the sync. + insertHookCall(SI, CsanSync, {SyncID, IRB.getInt32(SyncRegNum)}); + + // NOTE: Because Cilksan executes serially, any exceptions thrown before this + // sync will appear to be thrown from their respective spawns or calls, not + // the sync or the Cilk personality function. Hence we don't need + // instrumentation in the unwind destination of the sync. + + NumInstrumentedSyncs++; + return true; +} + +void CilkSanitizerImpl::instrumentTapirLoop(Loop &L, TaskInfo &TI, + DenseMap &SyncRegNums, + ScalarEvolution *SE) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SERIESPARALLEL)) + return; + + assert(L.isLoopSimplifyForm() && "CSI assumes loops are in simplified form."); + BasicBlock *Preheader = L.getLoopPreheader(); + Task *T = getTaskIfTapirLoop(&L, &TI); + assert(T && "CilkSanitizer should only instrument Tapir loops."); + unsigned SyncRegNum = SyncRegNums[T->getDetach()->getSyncRegion()]; + // We assign a local ID for this loop here, so that IDs for loops follow a + // depth-first ordering. + csi_id_t LocalId = LoopFED.add(*T->getDetach()); + + SmallVector ExitingBlocks; + L.getExitingBlocks(ExitingBlocks); + SmallVector ExitBlocks; + L.getUniqueExitBlocks(ExitBlocks); + + // Record properties of this loop. + CsiLoopProperty LoopProp; + LoopProp.setIsTapirLoop(static_cast(getTaskIfTapirLoop(&L, &TI))); + LoopProp.setHasUniqueExitingBlock((ExitingBlocks.size() == 1)); + + IRBuilder<> IRB(Preheader->getTerminator()); + Value *LoopCsiId = LoopFED.localToGlobalId(LocalId, IRB); + Value *LoopPropVal = LoopProp.getValue(IRB); + + // Try to evaluate the runtime trip count for this loop. Default to a count + // of -1 for unknown trip counts. + Value *TripCount = IRB.getInt64(-1); + if (SE) { + const SCEV *TripCountSC = getRuntimeTripCount(L, SE, true); + if (!isa(TripCountSC)) { + // Extend the TripCount type if necessary. + if (TripCountSC->getType() != IRB.getInt64Ty()) + TripCountSC = SE->getZeroExtendExpr(TripCountSC, IRB.getInt64Ty()); + // Compute the trip count to pass to the CSI hook. + SCEVExpander Expander(*SE, DL, "csi"); + TripCount = Expander.expandCodeFor(TripCountSC, IRB.getInt64Ty(), + &*IRB.GetInsertPoint()); + } + } + + // Insert before-loop hook. + insertHookCall(&*IRB.GetInsertPoint(), CsanBeforeLoop, {LoopCsiId, TripCount, + LoopPropVal}); + + // Insert after-loop hooks. + for (BasicBlock *BB : ExitBlocks) { + // If the exit block is simply enclosed inside the task, then its on an + // exceptional exit path from the task. In that case, the exit path will + // reach the unwind destination of the detach. Because the unwind + // destination of the detach is in the set of exit blocks, we can safely + // skip any exit blocks enclosed in the task. 
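At run time the loop hooks bracket the parallel loop, with the before hook receiving the trip count when scalar evolution can compute it and -1 otherwise, and the after hook receiving the loop's sync-region number. A stand-in picture, with an ordinary `for` loop playing the role of the Tapir loop:

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in loop hooks (simplified signatures).
extern "C" void stub_before_loop(uint64_t loop_id, int64_t trip_count) {
  std::printf("before_loop, %lld iterations\n", (long long)trip_count);
}
extern "C" void stub_after_loop(uint64_t loop_id, uint8_t sync_reg) {
  std::printf("after_loop (sync region %u)\n", (unsigned)sync_reg);
}

int main() {
  const int n = 100;
  int sum = 0;
  stub_before_loop(/*loop_id*/ 0, /*trip count, -1 if unknown*/ n);
  for (int i = 0; i < n; ++i)   // stands in for a cilk_for / Tapir loop
    sum += i;
  stub_after_loop(0, /*sync_reg*/ 0);
  return sum == 4950 ? 0 : 1;
}
```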
+ if (!T->encloses(BB)) { + IRB.SetInsertPoint(&*BB->getFirstInsertionPt()); + insertHookCall(&*IRB.GetInsertPoint(), CsanAfterLoop, + {LoopCsiId, IRB.getInt8(SyncRegNum), LoopPropVal}); + } + } +} + +bool CilkSanitizerImpl::instrumentAlloca(Instruction *I, TaskInfo &TI) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SHADOWMEMORY)) + return true; + + IRBuilder<> IRB(I); + bool AllocaInEntryBlock = isEntryBlock(*I->getParent(), TI); + if (AllocaInEntryBlock) + IRB.SetInsertPoint(getEntryBBInsertPt(*I->getParent())); + AllocaInst *AI = cast(I); + + uint64_t LocalId = AllocaFED.add(*I); + Value *CsiId = AllocaFED.localToGlobalId(LocalId, IRB); + uint64_t AllocaObjId = AllocaObj.add(*I, I); + assert(LocalId == AllocaObjId && + "Alloca received different ID's in FED and object tables."); + + CsiAllocaProperty Prop; + Prop.setIsStatic(AI->isStaticAlloca()); + Value *PropVal = Prop.getValue(IRB); + + // Get size of allocation. + uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType()); + Value *SizeVal = IRB.getInt64(Size); + if (AI->isArrayAllocation()) + SizeVal = IRB.CreateMul(SizeVal, + IRB.CreateZExtOrBitCast(AI->getArraySize(), + IRB.getInt64Ty())); + + BasicBlock::iterator Iter(I); + if (!AllocaInEntryBlock) { + Iter++; + IRB.SetInsertPoint(&*Iter); + } else { + Iter = IRB.GetInsertPoint(); + } + + Type *AddrType = IRB.getInt8PtrTy(); + Value *Addr = IRB.CreatePointerCast(I, AddrType); + insertHookCall(&*Iter, CsiAfterAlloca, {CsiId, Addr, SizeVal, PropVal}); + + NumInstrumentedAllocas++; + return true; +} + +static Value *getHeapObject(Value *I) { + Value *Object = nullptr; + unsigned NumOfBitCastUses = 0; + + // Determine if CallInst has a bitcast use. + for (Value::user_iterator UI = I->user_begin(), E = I->user_end(); + UI != E;) + if (BitCastInst *BCI = dyn_cast(*UI++)) { + // Look for a dbg.value intrinsic for this bitcast. + SmallVector DbgValues; + findDbgValues(DbgValues, BCI); + if (!DbgValues.empty()) { + Object = BCI; + NumOfBitCastUses++; + } + } + + // Heap-allocation call has 1 debug-bitcast use, so use that bitcast as the + // object. + if (NumOfBitCastUses == 1) + return Object; + + // Otherwise just use the heap-allocation call directly. 
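Returning to the alloca instrumentation above: the hook receives the address and the allocation size, where the size is the element size scaled by the array length for array allocas. In stand-in form (illustrative hook name and signature):

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for the after-alloca hook (simplified signature).
extern "C" void stub_after_alloca(uint64_t id, void *addr, uint64_t size) {
  std::printf("alloca %p, %llu bytes\n", addr, (unsigned long long)size);
}

int main() {
  double buf[16];                                  // a static alloca
  stub_after_alloca(/*id*/ 0, buf, sizeof(double) * 16);
  buf[0] = 1.0;
  return buf[0] == 1.0 ? 0 : 1;
}
```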
+ return I; +} + +bool CilkSanitizerImpl::instrumentAllocFnLibCall(Instruction *I, + const TargetLibraryInfo *TLI) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SHADOWMEMORY)) + return true; + + bool IsInvoke = isa(I); + CallBase *CB = dyn_cast(I); + if (!CB) + return false; + Function *Called = CB->getCalledFunction(); + + // Get the CSI IDs for this hook + IRBuilder<> IRB(I); + LLVMContext &Ctx = IRB.getContext(); + Value *DefaultID = getDefaultID(IRB); + uint64_t LocalId = AllocFnFED.add(*I); + Value *AllocFnId = AllocFnFED.localToGlobalId(LocalId, IRB); + Value *FuncId = GetCalleeFuncID(Called, IRB); + assert(FuncId != NULL); + + // Get the ID for the corresponding heap object + Value *HeapObj = nullptr; + if ("posix_memalign" == Called->getName()) + HeapObj = getHeapObject(CB->getArgOperand(0)); + else + HeapObj = getHeapObject(I); + uint64_t AllocFnObjId = AllocFnObj.add(*I, HeapObj); + assert(LocalId == AllocFnObjId && + "Allocation fn received different ID's in FED and object tables."); + + // TODO: Propagate MAAPs to allocation-function library calls + Value *NumMVVal = IRB.getInt8(0); + + CsiAllocFnProperty Prop; + Value *DefaultPropVal = Prop.getValue(IRB); + LibFunc AllocLibF; + TLI->getLibFunc(*Called, AllocLibF); + Prop.setAllocFnTy(static_cast(getAllocFnTy(AllocLibF))); + Value *PropVal = Prop.getValue(IRB); + Type *IDType = IRB.getInt64Ty(); + + // Synthesize the after hook for this function. + SmallVector AfterHookParamTys({IDType, /*callee func_id*/ IDType, + /*MAAP_count*/ IRB.getInt8Ty(), + CsiAllocFnProperty::getType(Ctx)}); + SmallVector AfterHookParamVals( + {AllocFnId, FuncId, NumMVVal, PropVal}); + SmallVector AfterHookDefaultVals( + {DefaultID, DefaultID, IRB.getInt8(0), DefaultPropVal}); + if (!Called->getReturnType()->isVoidTy()) { + AfterHookParamTys.push_back(Called->getReturnType()); + AfterHookParamVals.push_back(CB); + AfterHookDefaultVals.push_back( + Constant::getNullValue(Called->getReturnType())); + } + AfterHookParamTys.append(Called->getFunctionType()->param_begin(), + Called->getFunctionType()->param_end()); + AfterHookParamVals.append(CB->arg_begin(), CB->arg_end()); + for (Value *Arg : CB->args()) + AfterHookDefaultVals.push_back(Constant::getNullValue(Arg->getType())); + FunctionType *AfterHookTy = + FunctionType::get(IRB.getVoidTy(), AfterHookParamTys, Called->isVarArg()); + FunctionCallee AfterLibCallHook = getOrInsertSynthesizedHook( + ("__csan_alloc_" + Called->getName()).str(), AfterHookTy); + + // Insert the hook after the call. + BasicBlock::iterator Iter(I); + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block and the + // exception block. + InvokeInst *II = cast(I); + insertHookCallInSuccessorBB( + II->getNormalDest(), II->getParent(), AfterLibCallHook, + AfterHookParamVals, AfterHookDefaultVals); + // Don't insert any instrumentation in the exception block. + } else { + // Simple call instruction; there is only one "after" position. 
+ Iter++; + IRB.SetInsertPoint(&*Iter); + insertHookCall(&*Iter, AfterLibCallHook, AfterHookParamVals); + } + + NumInstrumentedAllocFns++; + return true; +} + +bool CilkSanitizerImpl::instrumentAllocationFn(Instruction *I, + DominatorTree &DT, + const TargetLibraryInfo *TLI) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SHADOWMEMORY)) + return true; + + bool IsInvoke = isa(I); + assert(isa(I) && + "instrumentAllocationFn not given a call or invoke instruction."); + Function *Called = dyn_cast(I)->getCalledFunction(); + assert(Called && "Could not get called function for allocation fn."); + + IRBuilder<> IRB(I); + SmallVector AllocFnArgs; + if (!getAllocFnArgs(I, AllocFnArgs, IntptrTy, IRB.getInt8PtrTy(), *TLI)) { + return instrumentAllocFnLibCall(I, TLI); + } + SmallVector DefaultAllocFnArgs( + {/* Allocated size */ Constant::getNullValue(IntptrTy), + /* Number of elements */ Constant::getNullValue(IntptrTy), + /* Alignment */ Constant::getNullValue(IntptrTy), + /* Old pointer */ Constant::getNullValue(IRB.getInt8PtrTy()),}); + + Value *DefaultID = getDefaultID(IRB); + uint64_t LocalId = AllocFnFED.add(*I); + Value *AllocFnId = AllocFnFED.localToGlobalId(LocalId, IRB); + uint64_t AllocFnObjId = AllocFnObj.add(*I, getHeapObject(I)); + assert(LocalId == AllocFnObjId && + "Allocation fn received different ID's in FED and object tables."); + + CsiAllocFnProperty Prop; + Value *DefaultPropVal = Prop.getValue(IRB); + LibFunc AllocLibF; + TLI->getLibFunc(*Called, AllocLibF); + Prop.setAllocFnTy(static_cast(getAllocFnTy(AllocLibF))); + AllocFnArgs.push_back(Prop.getValue(IRB)); + DefaultAllocFnArgs.push_back(DefaultPropVal); + + BasicBlock::iterator Iter(I); + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block and the + // exception block. + InvokeInst *II = cast(I); + + BasicBlock *NormalBB = II->getNormalDest(); + unsigned SuccNum = GetSuccessorNumber(II->getParent(), NormalBB); + if (isCriticalEdge(II, SuccNum)) + NormalBB = SplitCriticalEdge(II, SuccNum, + CriticalEdgeSplittingOptions(&DT)); + // Insert hook into normal destination. + { + IRB.SetInsertPoint(&*NormalBB->getFirstInsertionPt()); + SmallVector AfterAllocFnArgs; + AfterAllocFnArgs.push_back(AllocFnId); + AfterAllocFnArgs.push_back(IRB.CreatePointerCast(I, IRB.getInt8PtrTy())); + AfterAllocFnArgs.append(AllocFnArgs.begin(), AllocFnArgs.end()); + insertHookCall(&*IRB.GetInsertPoint(), CsanAfterAllocFn, + AfterAllocFnArgs); + } + // Insert hook into unwind destination. + { + // The return value of the allocation function is not valid in the unwind + // destination. + SmallVector AfterAllocFnArgs, DefaultAfterAllocFnArgs; + AfterAllocFnArgs.push_back(AllocFnId); + AfterAllocFnArgs.push_back(Constant::getNullValue(IRB.getInt8PtrTy())); + AfterAllocFnArgs.append(AllocFnArgs.begin(), AllocFnArgs.end()); + DefaultAfterAllocFnArgs.push_back(DefaultID); + DefaultAfterAllocFnArgs.push_back( + Constant::getNullValue(IRB.getInt8PtrTy())); + DefaultAfterAllocFnArgs.append(DefaultAllocFnArgs.begin(), + DefaultAllocFnArgs.end()); + insertHookCallInSuccessorBB( + II->getUnwindDest(), II->getParent(), CsanAfterAllocFn, + AfterAllocFnArgs, DefaultAfterAllocFnArgs); + } + } else { + // Simple call instruction; there is only one "after" position. 
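Taken together, the allocation hooks here and the free hook inserted just below give the runtime a picture like the following; the hook names and signatures are stand-ins, and the real hooks also receive CSI IDs, properties, and the parsed allocation arguments.

```cpp
#include <cstdint>
#include <cstdlib>

// Stand-ins for the allocation and free hooks (simplified signatures).
extern "C" void stub_after_allocfn(uint64_t id, void *new_ptr, uint64_t size) {}
extern "C" void stub_after_free(uint64_t id, void *freed_ptr) {}

int main() {
  void *p = std::malloc(64);
  stub_after_allocfn(/*id*/ 0, p, 64);   // runs after the allocation call
  std::free(p);
  stub_after_free(/*id*/ 1, p);          // runs after the free call
  return 0;
}
```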
+ Iter++; + IRB.SetInsertPoint(&*Iter); + SmallVector AfterAllocFnArgs; + AfterAllocFnArgs.push_back(AllocFnId); + AfterAllocFnArgs.push_back(IRB.CreatePointerCast(I, IRB.getInt8PtrTy())); + AfterAllocFnArgs.append(AllocFnArgs.begin(), AllocFnArgs.end()); + insertHookCall(&*Iter, CsanAfterAllocFn, AfterAllocFnArgs); + } + + NumInstrumentedAllocFns++; + return true; +} + +bool CilkSanitizerImpl::instrumentFree(Instruction *I, + const TargetLibraryInfo *TLI) { + // Only insert instrumentation if requested + if (!(InstrumentationSet & SHADOWMEMORY)) + return true; + + // It appears that frees (and deletes) never throw. + assert(isa(I) && "Free call is not a call instruction"); + + CallInst *FC = cast(I); + Function *Called = FC->getCalledFunction(); + assert(Called && "Could not get called function for free."); + + IRBuilder<> IRB(I); + uint64_t LocalId = FreeFED.add(*I); + Value *FreeId = FreeFED.localToGlobalId(LocalId, IRB); + + // All currently supported free functions free the first argument. + Value *Addr = FC->getArgOperand(0); + CsiFreeProperty Prop; + LibFunc FreeLibF; + TLI->getLibFunc(*Called, FreeLibF); + Prop.setFreeTy(static_cast(getFreeTy(FreeLibF))); + + BasicBlock::iterator Iter(I); + Iter++; + IRB.SetInsertPoint(&*Iter); + insertHookCall(&*Iter, CsanAfterFree, {FreeId, Addr, Prop.getValue(IRB)}); + + NumInstrumentedFrees++; + return true; +} + +bool CilkSanitizerLegacyPass::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + CallGraph *CG = &getAnalysis().getCallGraph(); + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; + auto GetDomTree = [this](Function &F) -> DominatorTree & { + return this->getAnalysis(F).getDomTree(); + }; + auto GetTaskInfo = [this](Function &F) -> TaskInfo & { + return this->getAnalysis(F).getTaskInfo(); + }; + auto GetLoopInfo = [this](Function &F) -> LoopInfo & { + return this->getAnalysis(F).getLoopInfo(); + }; + auto GetRaceInfo = [this](Function &F) -> RaceInfo & { + return this->getAnalysis(F).getRaceInfo(); + }; + auto GetSE = [this](Function &F) -> ScalarEvolution & { + return this->getAnalysis(F).getSE(); + }; + + bool Changed = + CilkSanitizerImpl(M, CG, GetDomTree, nullptr, GetLoopInfo, nullptr, + GetTLI, nullptr, CallsMayThrow, JitMode) + .setup(true); + Changed |= + CilkSanitizerImpl(M, CG, GetDomTree, GetTaskInfo, GetLoopInfo, + GetRaceInfo, GetTLI, GetSE, CallsMayThrow, JitMode) + .run(); + return Changed; +} + +PreservedAnalyses CilkSanitizerPass::run(Module &M, ModuleAnalysisManager &AM) { + auto &FAM = AM.getResult(M).getManager(); + auto &CG = AM.getResult(M); + auto GetDT = + [&FAM](Function &F) -> DominatorTree & { + return FAM.getResult(F); + }; + auto GetTI = + [&FAM](Function &F) -> TaskInfo & { + return FAM.getResult(F); + }; + auto GetLI = + [&FAM](Function &F) -> LoopInfo & { + return FAM.getResult(F); + }; + auto GetRI = + [&FAM](Function &F) -> RaceInfo & { + return FAM.getResult(F); + }; + auto GetTLI = + [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; + auto GetSE = [&FAM](Function &F) -> ScalarEvolution & { + return FAM.getResult(F); + }; + + bool Changed = + CilkSanitizerImpl(M, &CG, GetDT, nullptr, GetLI, nullptr, GetTLI, nullptr) + .setup(false); + Changed |= + CilkSanitizerImpl(M, &CG, GetDT, GetTI, GetLI, GetRI, GetTLI, GetSE) + .run(); + + if (!Changed) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp 
b/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp new file mode 100644 index 000000000000000..dab90267f9af374 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp @@ -0,0 +1,2945 @@ +//===- ComprehensiveStaticInstrumentation.cpp - CSI compiler pass ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is part of CSI, a framework that provides comprehensive static +// instrumentation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/EHPersonalities.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/InitializePasses.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/ModRef.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Instrumentation/CSI.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "csi" + +static cl::opt + ClInstrumentFuncEntryExit("csi-instrument-func-entry-exit", cl::init(true), + cl::desc("Instrument function entry and exit"), + cl::Hidden); +static cl::opt + ClInstrumentLoops("csi-instrument-loops", cl::init(true), + cl::desc("Instrument loops"), cl::Hidden); +static cl::opt + ClInstrumentBasicBlocks("csi-instrument-basic-blocks", cl::init(true), + cl::desc("Instrument basic blocks"), cl::Hidden); +static cl::opt + ClInstrumentMemoryAccesses("csi-instrument-memory-accesses", cl::init(true), + cl::desc("Instrument memory accesses"), + cl::Hidden); +static cl::opt ClInstrumentCalls("csi-instrument-function-calls", + cl::init(true), + cl::desc("Instrument function calls"), + cl::Hidden); +static cl::opt ClInstrumentAtomics("csi-instrument-atomics", + cl::init(true), + cl::desc("Instrument atomics"), + cl::Hidden); +static cl::opt ClInstrumentMemIntrinsics( + "csi-instrument-memintrinsics", cl::init(true), + cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden); +static cl::opt 
ClInstrumentTapir("csi-instrument-tapir", cl::init(true), + cl::desc("Instrument tapir constructs"), + cl::Hidden); +static cl::opt ClInstrumentAllocas("csi-instrument-alloca", + cl::init(true), + cl::desc("Instrument allocas"), + cl::Hidden); +static cl::opt + ClInstrumentAllocFns("csi-instrument-allocfn", cl::init(true), + cl::desc("Instrument allocation functions"), + cl::Hidden); + +static cl::opt ClInterpose("csi-interpose", cl::init(true), + cl::desc("Enable function interpositioning"), + cl::Hidden); + +static cl::opt ClToolBitcode( + "csi-tool-bitcode", cl::init(""), + cl::desc("Path to the tool bitcode file for compile-time instrumentation"), + cl::Hidden); + +static cl::opt + ClRuntimeBitcode("csi-runtime-bitcode", cl::init(""), + cl::desc("Path to the CSI runtime bitcode file for " + "optimized compile-time instrumentation"), + cl::Hidden); + +static cl::opt ClToolLibrary( + "csi-tool-library", cl::init(""), + cl::desc("Path to the tool library file for compile-time instrumentation"), + cl::Hidden); + +static cl::opt ClConfigurationFilename( + "csi-config-filename", cl::init(""), + cl::desc("Path to the configuration file for surgical instrumentation"), + cl::Hidden); + +static cl::opt ClConfigurationMode( + "csi-config-mode", cl::init(InstrumentationConfigMode::WHITELIST), + cl::values(clEnumValN(InstrumentationConfigMode::WHITELIST, "whitelist", + "Use configuration file as a whitelist"), + clEnumValN(InstrumentationConfigMode::BLACKLIST, "blacklist", + "Use configuration file as a blacklist")), + cl::desc("Specifies how to interpret the configuration file"), cl::Hidden); + +static cl::opt + AssumeNoExceptions( + "csi-assume-no-exceptions", cl::init(false), cl::Hidden, + cl::desc("Assume that ordinary calls cannot throw exceptions.")); + +static cl::opt + SplitBlocksAtCalls( + "csi-split-blocks-at-calls", cl::init(true), cl::Hidden, + cl::desc("Split basic blocks at function calls.")); + +static size_t numPassRuns = 0; +bool IsFirstRun() { return numPassRuns == 0; } + +namespace { + +static CSIOptions OverrideFromCL(CSIOptions Options) { + Options.InstrumentFuncEntryExit = ClInstrumentFuncEntryExit; + Options.InstrumentLoops = ClInstrumentLoops; + Options.InstrumentBasicBlocks = ClInstrumentBasicBlocks; + Options.InstrumentMemoryAccesses = ClInstrumentMemoryAccesses; + Options.InstrumentCalls = ClInstrumentCalls; + Options.InstrumentAtomics = ClInstrumentAtomics; + Options.InstrumentMemIntrinsics = ClInstrumentMemIntrinsics; + Options.InstrumentTapir = ClInstrumentTapir; + Options.InstrumentAllocas = ClInstrumentAllocas; + Options.InstrumentAllocFns = ClInstrumentAllocFns; + Options.CallsMayThrow = !AssumeNoExceptions; + Options.CallsTerminateBlocks = SplitBlocksAtCalls; + return Options; +} + +/// The Comprehensive Static Instrumentation pass. +/// Inserts calls to user-defined hooks at predefined points in the IR. +struct ComprehensiveStaticInstrumentationLegacyPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid. 
+ + ComprehensiveStaticInstrumentationLegacyPass( + const CSIOptions &Options = OverrideFromCL(CSIOptions())) + : ModulePass(ID), Options(Options) { + initializeComprehensiveStaticInstrumentationLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + StringRef getPassName() const override { + return "ComprehensiveStaticInstrumentation"; + } + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + CSIOptions Options; +}; // struct ComprehensiveStaticInstrumentation +} // anonymous namespace + +char ComprehensiveStaticInstrumentationLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(ComprehensiveStaticInstrumentationLegacyPass, "csi", + "ComprehensiveStaticInstrumentation pass", false, false) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ComprehensiveStaticInstrumentationLegacyPass, "csi", + "ComprehensiveStaticInstrumentation pass", false, false) + +/// Return the first DILocation in the given basic block, or nullptr +/// if none exists. +static const DILocation *getFirstDebugLoc(const BasicBlock &BB) { + for (const Instruction &Inst : BB) + if (const DILocation *Loc = Inst.getDebugLoc()) + return Loc; + + return nullptr; +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(Instruction *Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented->getFunction()->getSubprogram(); + if (Subprog) { + if (Instrumented->getDebugLoc()) { + Call->setDebugLoc(Instrumented->getDebugLoc()); + } else { + LLVMContext &C = Instrumented->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } + } +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(BasicBlock &Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented.getParent()->getSubprogram(); + if (Subprog) { + if (const DILocation *FirstDebugLoc = getFirstDebugLoc(Instrumented)) + Call->setDebugLoc(FirstDebugLoc); + else { + LLVMContext &C = Instrumented.getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } + } +} + +bool CSISetupImpl::run() { + bool Changed = false; + for (Function &F : M) + Changed |= setupFunction(F); + return Changed; +} + +bool CSISetupImpl::setupFunction(Function &F) { + if (F.empty() || CSIImpl::shouldNotInstrumentFunction(F)) + return false; + + if (Options.CallsMayThrow) + // Promote calls to invokes to insert CSI instrumentation in + // exception-handling code. + CSIImpl::setupCalls(F); + + // If we do not assume that calls terminate blocks, or if we're not + // instrumenting basic blocks, then we're done. + if (Options.InstrumentBasicBlocks && Options.CallsTerminateBlocks) + CSIImpl::splitBlocksAtCalls(F); + + LLVM_DEBUG(dbgs() << "Setup function:\n" << F); + + return true; +} + +bool CSIImpl::callsPlaceholderFunction(const Instruction &I) { + if (isa(I)) + return true; + + if (isDetachedRethrow(&I) || isTaskFrameResume(&I) || isSyncUnwind(&I)) + return true; + + if (const IntrinsicInst *II = dyn_cast(&I)) + switch (II->getIntrinsicID()) { + default: break; + // FIXME: This list is repeated from NoTTI::getIntrinsicCost. 
+ case Intrinsic::annotation: + case Intrinsic::assume: + case Intrinsic::sideeffect: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + case Intrinsic::is_constant: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + case Intrinsic::experimental_gc_result: + case Intrinsic::experimental_gc_relocate: + case Intrinsic::experimental_noalias_scope_decl: + case Intrinsic::coro_alloc: + case Intrinsic::coro_begin: + case Intrinsic::coro_free: + case Intrinsic::coro_end: + case Intrinsic::coro_frame: + case Intrinsic::coro_size: + case Intrinsic::coro_suspend: + case Intrinsic::coro_subfn_addr: + case Intrinsic::syncregion_start: + case Intrinsic::taskframe_create: + case Intrinsic::taskframe_use: + case Intrinsic::taskframe_end: + case Intrinsic::taskframe_load_guard: + case Intrinsic::tapir_runtime_start: + case Intrinsic::tapir_runtime_end: + // These intrinsics don't actually represent code after lowering. + return true; + } + + return false; +} + +bool CSIImpl::spawnsTapirLoopBody(DetachInst *DI, LoopInfo &LI, TaskInfo &TI) { + Loop *L = LI.getLoopFor(DI->getParent()); + return (TI.getTaskFor(DI->getDetached()) == getTaskIfTapirLoop(L, &TI)); +} + +bool CSIImpl::run() { + // Link the tool bitcode once initially, to get type definitions. + linkInToolFromBitcode(ClToolBitcode); + initializeCsi(); + + for (Function &F : M) + instrumentFunction(F); + + collectUnitFEDTables(); + collectUnitSizeTables(); + + finalizeCsi(); + + if (IsFirstRun() && Options.jitMode) { + llvm::sys::DynamicLibrary::LoadLibraryPermanently(ClToolLibrary.c_str()); + } + // Link the tool bitcode a second time, for definitions of used functions. + linkInToolFromBitcode(ClToolBitcode); + linkInToolFromBitcode(ClRuntimeBitcode); + + return true; // We always insert the unit constructor. +} + +Constant *ForensicTable::getObjectStrGV(Module &M, StringRef Str, + const Twine GVName) { + LLVMContext &C = M.getContext(); + IntegerType *Int32Ty = IntegerType::get(C, 32); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + Value *GepArgs[] = {Zero, Zero}; + if (Str.empty()) + return ConstantPointerNull::get( + PointerType::get(IntegerType::get(C, 8), 0)); + + Constant *NameStrConstant = ConstantDataArray::getString(C, Str); + GlobalVariable *GV = M.getGlobalVariable((GVName + Str).str(), true); + if (GV == NULL) { + GV = new GlobalVariable(M, NameStrConstant->getType(), true, + GlobalValue::PrivateLinkage, NameStrConstant, + GVName + Str, nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + return ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); +} + +ForensicTable::ForensicTable(Module &M, StringRef BaseIdName, + StringRef TableName, bool UseExistingBaseId) + : TableName(TableName) { + LLVMContext &C = M.getContext(); + IntegerType *Int64Ty = IntegerType::get(C, 64); + IdCounter = 0; + + if (UseExistingBaseId) + // Try to look up an existing BaseId to use. 
+ BaseId = M.getGlobalVariable(BaseIdName, true); + if (nullptr == BaseId) + BaseId = new GlobalVariable(M, Int64Ty, false, GlobalValue::InternalLinkage, + ConstantInt::get(Int64Ty, 0), BaseIdName); + assert(BaseId); +} + +uint64_t ForensicTable::getId(const Value *V) { + if (!ValueToLocalIdMap.count(V)) + ValueToLocalIdMap[V] = IdCounter++; + assert(ValueToLocalIdMap.count(V) && "Value not in ID map."); + return ValueToLocalIdMap[V]; +} + +Value *ForensicTable::localToGlobalId(uint64_t LocalId, + IRBuilder<> &IRB) const { + assert(BaseId); + LLVMContext &C = IRB.getContext(); + Type *BaseIdTy = IRB.getInt64Ty(); + LoadInst *Base = IRB.CreateLoad(BaseIdTy, BaseId); + MDNode *MD = MDNode::get(C, std::nullopt); + Base->setMetadata(LLVMContext::MD_invariant_load, MD); + Value *Offset = IRB.getInt64(LocalId); + return IRB.CreateAdd(Base, Offset); +} + +uint64_t SizeTable::add(const BasicBlock &BB, TargetTransformInfo *TTI) { + uint64_t ID = getId(&BB); + // Count the LLVM IR instructions + int32_t IRCost = 0; + for (const Instruction &I : BB) { + if (TTI) { + InstructionCost ICost = + TTI->getInstructionCost(&I, TargetTransformInfo::TCK_Latency); + if (!ICost.isValid()) + IRCost += static_cast(TargetTransformInfo::TCC_Basic); + else + IRCost += *(ICost.getValue()); + } else { + if (isa(I)) + continue; + if (CSIImpl::callsPlaceholderFunction(I)) + continue; + IRCost++; + } + } + add(ID, BB.size(), IRCost); + return ID; +} + +PointerType *SizeTable::getPointerType(LLVMContext &C) { + return PointerType::get(getSizeStructType(C), 0); +} + +StructType *SizeTable::getSizeStructType(LLVMContext &C) { + return StructType::get( + /* FullIRSize */ IntegerType::get(C, 32), + /* NonEmptyIRSize */ IntegerType::get(C, 32)); +} + +void SizeTable::add(uint64_t ID, int32_t FullIRSize, int32_t NonEmptyIRSize) { + assert(LocalIdToSizeMap.find(ID) == LocalIdToSizeMap.end() && + "ID already exists in FED table."); + LocalIdToSizeMap[ID] = {FullIRSize, NonEmptyIRSize}; +} + +Constant *SizeTable::insertIntoModule(Module &M) const { + LLVMContext &C = M.getContext(); + StructType *TableType = getSizeStructType(C); + IntegerType *Int32Ty = IntegerType::get(C, 32); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + Value *GepArgs[] = {Zero, Zero}; + SmallVector TableEntries; + + for (uint64_t LocalID = 0; LocalID < IdCounter; ++LocalID) { + const SizeInformation &E = LocalIdToSizeMap.find(LocalID)->second; + Constant *FullIRSize = ConstantInt::get(Int32Ty, E.FullIRSize); + Constant *NonEmptyIRSize = ConstantInt::get(Int32Ty, E.NonEmptyIRSize); + // The order of arguments to ConstantStruct::get() must match the + // sizeinfo_t type in csi.h. 
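A brief aside on the ID scheme these tables implement: each table hands out dense local IDs at compile time, and `localToGlobalId` above emits a load of a per-unit base global plus the local offset, so IDs become globally unique once the runtime assigns each unit its base. In ordinary C++ terms the scheme is just the following sketch:

```cpp
#include <cstdint>
#include <map>

// Compile-time side: hand out dense local IDs per object.
struct LocalIdTable {
  std::map<const void *, uint64_t> Ids;
  uint64_t Next = 0;
  uint64_t getId(const void *Obj) {
    auto It = Ids.find(Obj);
    if (It != Ids.end())
      return It->second;
    return Ids[Obj] = Next++;
  }
};

// Run-time side: the unit's base is fixed when the unit registers, and
// every global ID is simply base + local offset.
static uint64_t UnitBase = 0;
static uint64_t localToGlobal(uint64_t LocalId) { return UnitBase + LocalId; }

int main() {
  LocalIdTable T;
  int A, B;
  uint64_t LA = T.getId(&A), LB = T.getId(&B);
  UnitBase = 1000;   // assigned when the unit registers with the runtime
  return (localToGlobal(LA) == 1000 && localToGlobal(LB) == 1001) ? 0 : 1;
}
```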
+ TableEntries.push_back( + ConstantStruct::get(TableType, FullIRSize, NonEmptyIRSize)); + } + + ArrayType *TableArrayType = ArrayType::get(TableType, TableEntries.size()); + Constant *Table = ConstantArray::get(TableArrayType, TableEntries); + GlobalVariable *GV = + new GlobalVariable(M, TableArrayType, false, GlobalValue::InternalLinkage, + Table, CsiUnitSizeTableName); + return ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); +} + +uint64_t FrontEndDataTable::add(const Function &F) { + uint64_t ID = getId(&F); + if (F.getSubprogram()) + add(ID, F.getSubprogram()); + else + add(ID, -1, -1, F.getParent()->getName(), "", F.getName()); + return ID; +} + +uint64_t FrontEndDataTable::add(const BasicBlock &BB) { + uint64_t ID = getId(&BB); + add(ID, getFirstDebugLoc(BB)); + return ID; +} + +uint64_t FrontEndDataTable::add(const Instruction &I, + const StringRef &RealName) { + uint64_t ID = getId(&I); + if (auto DL = I.getDebugLoc()) + add(ID, DL, RealName); + else { + if (const DISubprogram *Subprog = I.getFunction()->getSubprogram()) + add(ID, (int32_t)Subprog->getLine(), -1, Subprog->getFilename(), + Subprog->getDirectory(), + RealName == "" ? Subprog->getName() : RealName); + else + add(ID, -1, -1, I.getModule()->getName(), "", + RealName == "" ? I.getFunction()->getName() : RealName); + } + return ID; +} + +PointerType *FrontEndDataTable::getPointerType(LLVMContext &C) { + return PointerType::get(getSourceLocStructType(C), 0); +} + +StructType *FrontEndDataTable::getSourceLocStructType(LLVMContext &C) { + return StructType::get( + /* Name */ PointerType::get(IntegerType::get(C, 8), 0), + /* Line */ IntegerType::get(C, 32), + /* Column */ IntegerType::get(C, 32), + /* File */ PointerType::get(IntegerType::get(C, 8), 0)); +} + +void FrontEndDataTable::add(uint64_t ID, const DILocation *Loc, + const StringRef &RealName) { + if (Loc) { + // TODO: Add location information for inlining + const DISubprogram *Subprog = Loc->getScope()->getSubprogram(); + add(ID, (int32_t)Loc->getLine(), (int32_t)Loc->getColumn(), + Loc->getFilename(), Loc->getDirectory(), + RealName == "" ? Subprog->getName() : RealName); + } else + add(ID); +} + +void FrontEndDataTable::add(uint64_t ID, const DISubprogram *Subprog) { + if (Subprog) + add(ID, (int32_t)Subprog->getLine(), -1, Subprog->getFilename(), + Subprog->getDirectory(), Subprog->getName()); + else + add(ID); +} + +void FrontEndDataTable::add(uint64_t ID, int32_t Line, int32_t Column, + StringRef Filename, StringRef Directory, + StringRef Name) { + // TODO: This assert is too strong for unwind basic blocks' FED. + /*assert(LocalIdToSourceLocationMap.find(ID) == + LocalIdToSourceLocationMap.end() && + "Id already exists in FED table."); */ + LocalIdToSourceLocationMap[ID] = {Name, Line, Column, Filename, Directory}; +} + +// The order of arguments to ConstantStruct::get() must match the source_loc_t +// type in csi.h. 
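For reference, the entries built below presumably map onto a runtime-side record along these lines. This is a sketch only: the field names and exact declaration in csi.h are assumptions, but the field order and types mirror getSourceLocStructType and addFEDTableEntries.

```c
#include <stdint.h>

/* Sketch of the source_loc_t record in csi.h (names assumed). The field
   order must match the arguments passed to ConstantStruct::get() below:
   name, line, column, file. */
typedef struct {
  const char *name;      /* function or variable name, possibly NULL */
  int32_t line_number;   /* -1 when unknown */
  int32_t column_number; /* -1 when unknown */
  const char *filename;  /* directory-prefixed path, possibly NULL */
} source_loc_t;
```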
+static void addFEDTableEntries(SmallVectorImpl &FEDEntries, + StructType *FedType, Constant *Name, + Constant *Line, Constant *Column, + Constant *File) { + FEDEntries.push_back(ConstantStruct::get(FedType, Name, Line, Column, File)); +} + +Constant *FrontEndDataTable::insertIntoModule(Module &M) const { + LLVMContext &C = M.getContext(); + StructType *FedType = getSourceLocStructType(C); + IntegerType *Int32Ty = IntegerType::get(C, 32); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + Value *GepArgs[] = {Zero, Zero}; + SmallVector FEDEntries; + + for (uint64_t LocalID = 0; LocalID < IdCounter; ++LocalID) { + const SourceLocation &E = LocalIdToSourceLocationMap.find(LocalID)->second; + Constant *Line = ConstantInt::get(Int32Ty, E.Line); + Constant *Column = ConstantInt::get(Int32Ty, E.Column); + Constant *File; + { + std::string Filename = E.Filename.str(); + if (!E.Directory.empty()) + Filename = E.Directory.str() + "/" + Filename; + File = getObjectStrGV(M, Filename, "__csi_unit_filename_"); + } + Constant *Name = getObjectStrGV(M, E.Name, "__csi_unit_function_name_"); + addFEDTableEntries(FEDEntries, FedType, Name, Line, Column, File); + } + + ArrayType *FedArrayType = ArrayType::get(FedType, FEDEntries.size()); + Constant *Table = ConstantArray::get(FedArrayType, FEDEntries); + GlobalVariable *GV = + new GlobalVariable(M, FedArrayType, false, GlobalValue::InternalLinkage, + Table, CsiUnitFedTableName + BaseId->getName()); + return ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); +} + +/// Function entry and exit hook initialization +void CSIImpl::initializeFuncHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + // Initialize function entry hook + Type *FuncPropertyTy = CsiFuncProperty::getType(C); + CsiFuncEntry = M.getOrInsertFunction("__csi_func_entry", IRB.getVoidTy(), + IRB.getInt64Ty(), FuncPropertyTy); + // Initialize function exit hook + Type *FuncExitPropertyTy = CsiFuncExitProperty::getType(C); + CsiFuncExit = M.getOrInsertFunction("__csi_func_exit", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), + FuncExitPropertyTy); +} + +/// Basic-block hook initialization +void CSIImpl::initializeBasicBlockHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *PropertyTy = CsiBBProperty::getType(C); + CsiBBEntry = M.getOrInsertFunction("__csi_bb_entry", IRB.getVoidTy(), + IRB.getInt64Ty(), PropertyTy); + CsiBBExit = M.getOrInsertFunction("__csi_bb_exit", IRB.getVoidTy(), + IRB.getInt64Ty(), PropertyTy); +} + +/// Loop hook initialization +void CSIImpl::initializeLoopHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *IDType = IRB.getInt64Ty(); + Type *LoopPropertyTy = CsiLoopProperty::getType(C); + Type *LoopExitPropertyTy = CsiLoopExitProperty::getType(C); + + CsiBeforeLoop = M.getOrInsertFunction("__csi_before_loop", IRB.getVoidTy(), + IDType, IRB.getInt64Ty(), + LoopPropertyTy); + CsiAfterLoop = M.getOrInsertFunction("__csi_after_loop", IRB.getVoidTy(), + IDType, LoopPropertyTy); + + CsiLoopBodyEntry = M.getOrInsertFunction("__csi_loopbody_entry", + IRB.getVoidTy(), IDType, + LoopPropertyTy); + CsiLoopBodyExit = M.getOrInsertFunction("__csi_loopbody_exit", + IRB.getVoidTy(), IDType, IDType, + LoopExitPropertyTy); +} + +// Call-site hook initialization +void CSIImpl::initializeCallsiteHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *PropertyTy = CsiCallProperty::getType(C); + CsiBeforeCallsite = M.getOrInsertFunction("__csi_before_call", + IRB.getVoidTy(), IRB.getInt64Ty(), + 
IRB.getInt64Ty(), PropertyTy); + CsiAfterCallsite = M.getOrInsertFunction("__csi_after_call", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), + PropertyTy); +} + +// Alloca (local variable) hook initialization +void CSIImpl::initializeAllocaHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *IDType = IRB.getInt64Ty(); + Type *AddrType = IRB.getInt8PtrTy(); + Type *PropType = CsiAllocaProperty::getType(C); + + CsiAfterAlloca = M.getOrInsertFunction("__csi_after_alloca", IRB.getVoidTy(), + IDType, AddrType, IntptrTy, PropType); +} + +// Non-local-variable allocation/free hook initialization +void CSIImpl::initializeAllocFnHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *RetType = IRB.getVoidTy(); + Type *IDType = IRB.getInt64Ty(); + Type *AddrType = IRB.getInt8PtrTy(); + Type *LargeNumBytesType = IntptrTy; + Type *AllocFnPropType = CsiAllocFnProperty::getType(C); + Type *FreePropType = CsiFreeProperty::getType(C); + + CsiBeforeAllocFn = M.getOrInsertFunction("__csi_before_allocfn", RetType, + IDType, LargeNumBytesType, + LargeNumBytesType, LargeNumBytesType, + AddrType, AllocFnPropType); + CsiAfterAllocFn = M.getOrInsertFunction("__csi_after_allocfn", RetType, + IDType, /* new ptr */ AddrType, + /* size */ LargeNumBytesType, + /* num elements */ LargeNumBytesType, + /* alignment */ LargeNumBytesType, + /* old ptr */ AddrType, + /* property */ AllocFnPropType); + + CsiBeforeFree = M.getOrInsertFunction("__csi_before_free", RetType, IDType, + AddrType, FreePropType); + CsiAfterFree = M.getOrInsertFunction("__csi_after_free", RetType, IDType, + AddrType, FreePropType); +} + +// Load and store hook initialization +void CSIImpl::initializeLoadStoreHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *LoadPropertyTy = CsiLoadStoreProperty::getType(C); + Type *StorePropertyTy = CsiLoadStoreProperty::getType(C); + Type *RetType = IRB.getVoidTy(); + Type *AddrType = IRB.getInt8PtrTy(); + Type *NumBytesType = IRB.getInt32Ty(); + + CsiBeforeRead = M.getOrInsertFunction("__csi_before_load", RetType, + IRB.getInt64Ty(), AddrType, + NumBytesType, LoadPropertyTy); + CsiAfterRead = M.getOrInsertFunction("__csi_after_load", RetType, + IRB.getInt64Ty(), AddrType, NumBytesType, + LoadPropertyTy); + + CsiBeforeWrite = M.getOrInsertFunction("__csi_before_store", RetType, + IRB.getInt64Ty(), AddrType, + NumBytesType, StorePropertyTy); + CsiAfterWrite = M.getOrInsertFunction("__csi_after_store", RetType, + IRB.getInt64Ty(), AddrType, + NumBytesType, StorePropertyTy); +} + +// Initialization of hooks for LLVM memory intrinsics +void CSIImpl::initializeMemIntrinsicsHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + + MemmoveFn = M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IntptrTy); + MemcpyFn = M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IntptrTy); + MemsetFn = M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt32Ty(), + IntptrTy); +} + +// Initialization of Tapir hooks +void CSIImpl::initializeTapirHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *IDType = IRB.getInt64Ty(); + Type *SyncRegType = IRB.getInt32Ty(); + Type *RetType = IRB.getVoidTy(); + Type *TaskPropertyTy = CsiTaskProperty::getType(C); + Type *TaskExitPropertyTy = CsiTaskExitProperty::getType(C); + Type *DetachPropertyTy = CsiDetachProperty::getType(C); + Type *DetContPropertyTy = 
CsiDetachContinueProperty::getType(C);
+
+ CsiDetach =
+ M.getOrInsertFunction("__csi_detach", RetType,
+ /* detach_id */ IDType,
+ /* sync_reg */ SyncRegType, DetachPropertyTy);
+ CsiTaskEntry = M.getOrInsertFunction("__csi_task", RetType,
+ /* task_id */ IDType,
+ /* detach_id */ IDType, TaskPropertyTy);
+ CsiTaskExit =
+ M.getOrInsertFunction("__csi_task_exit", RetType,
+ /* task_exit_id */ IDType,
+ /* task_id */ IDType,
+ /* detach_id */ IDType,
+ /* sync_reg */ SyncRegType, TaskExitPropertyTy);
+ CsiDetachContinue =
+ M.getOrInsertFunction("__csi_detach_continue", RetType,
+ /* detach_continue_id */ IDType,
+ /* detach_id */ IDType,
+ /* sync_reg */ SyncRegType, DetContPropertyTy);
+ CsiBeforeSync =
+ M.getOrInsertFunction("__csi_before_sync", RetType, IDType, SyncRegType);
+ CsiAfterSync =
+ M.getOrInsertFunction("__csi_after_sync", RetType, IDType, SyncRegType);
+}
+
+// Prepare any calls in the CFG for instrumentation, e.g., by making sure any
+// call that can throw is modeled with an invoke.
+void CSIImpl::setupCalls(Function &F) {
+ // If F does not throw, then no need to promote calls to invokes.
+ if (F.doesNotThrow())
+ return;
+
+ promoteCallsInTasksToInvokes(F, "csi.cleanup");
+}
+
+static BasicBlock *SplitOffPreds(BasicBlock *BB,
+ SmallVectorImpl<BasicBlock *> &Preds,
+ DominatorTree *DT, LoopInfo *LI) {
+ if (BB->isLandingPad()) {
+ SmallVector<BasicBlock *> NewBBs;
+ SplitLandingPadPredecessors(BB, Preds, ".csi-split-lp", ".csi-split",
+ NewBBs, DT, LI);
+ return NewBBs[1];
+ }
+
+ BasicBlock *NewBB = SplitBlockPredecessors(BB, Preds, ".csi-split", DT, LI);
+ if (isa<UnreachableInst>(BB->getFirstNonPHIOrDbg())) {
+ // If the block being split simply contains an unreachable, then replace
+ // the terminator of the new block with an unreachable. This helps preserve
+ // invariants on the CFG structure for Tapir placeholder blocks following
+ // detached.rethrow and taskframe.resume terminators.
+ ReplaceInstWithInst(NewBB->getTerminator(),
+ new UnreachableInst(BB->getContext()));
+ if (DT) {
+ DT->deleteEdge(NewBB, BB);
+ }
+ }
+ return BB;
+}
+
+// Set up each block such that all of its predecessors belong to the same CSI ID
+// space.
+static void setupBlock(BasicBlock *BB, const TargetLibraryInfo *TLI,
+ DominatorTree *DT, LoopInfo *LI) {
+ if (BB->getUniquePredecessor())
+ return;
+
+ SmallVector<BasicBlock *> DetachPreds;
+ SmallVector<BasicBlock *> TFResumePreds;
+ SmallVector<BasicBlock *> SyncPreds;
+ SmallVector<BasicBlock *> SyncUnwindPreds;
+ SmallVector<BasicBlock *> AllocFnPreds;
+ SmallVector<BasicBlock *> FreeFnPreds;
+ SmallVector<BasicBlock *> InvokePreds;
+ bool HasOtherPredTypes = false;
+ unsigned NumPredTypes = 0;
+
+ // Partition the predecessors of the landing pad. 
+ for (BasicBlock *Pred : predecessors(BB)) {
+ if (isa<DetachInst>(Pred->getTerminator()) ||
+ isa<ReattachInst>(Pred->getTerminator()) ||
+ isDetachedRethrow(Pred->getTerminator()))
+ DetachPreds.push_back(Pred);
+ else if (isTaskFrameResume(Pred->getTerminator()))
+ TFResumePreds.push_back(Pred);
+ else if (isa<SyncInst>(Pred->getTerminator()))
+ SyncPreds.push_back(Pred);
+ else if (isSyncUnwind(Pred->getTerminator()))
+ SyncUnwindPreds.push_back(Pred);
+ else if (CSIImpl::isAllocFn(Pred->getTerminator(), TLI))
+ AllocFnPreds.push_back(Pred);
+ else if (CSIImpl::isFreeFn(Pred->getTerminator(), TLI))
+ FreeFnPreds.push_back(Pred);
+ else if (isa<InvokeInst>(Pred->getTerminator()))
+ InvokePreds.push_back(Pred);
+ else
+ HasOtherPredTypes = true;
+ }
+
+ NumPredTypes = static_cast<unsigned>(!DetachPreds.empty()) +
+ static_cast<unsigned>(!TFResumePreds.empty()) +
+ static_cast<unsigned>(!SyncPreds.empty()) +
+ static_cast<unsigned>(!SyncUnwindPreds.empty()) +
+ static_cast<unsigned>(!AllocFnPreds.empty()) +
+ static_cast<unsigned>(!FreeFnPreds.empty()) +
+ static_cast<unsigned>(!InvokePreds.empty()) +
+ static_cast<unsigned>(HasOtherPredTypes);
+
+ // Splitting predecessors works differently for landingpads versus normal
+ // basic blocks. If the block is not a landingpad, split off every type of
+ // predecessor.
+ unsigned NumPredTypesRequired = static_cast<unsigned>(BB->isLandingPad());
+ if (NumPredTypes <= NumPredTypesRequired)
+ return;
+
+ BasicBlock *BBToSplit = BB;
+ // Split off the predecessors of each type.
+ if (!SyncPreds.empty() && NumPredTypes > NumPredTypesRequired) {
+ BBToSplit = SplitOffPreds(BBToSplit, SyncPreds, DT, LI);
+ NumPredTypes--;
+ }
+ if (!SyncUnwindPreds.empty() && NumPredTypes > NumPredTypesRequired) {
+ BBToSplit = SplitOffPreds(BBToSplit, SyncUnwindPreds, DT, LI);
+ NumPredTypes--;
+ }
+ if (!AllocFnPreds.empty() && NumPredTypes > NumPredTypesRequired) {
+ BBToSplit = SplitOffPreds(BBToSplit, AllocFnPreds, DT, LI);
+ NumPredTypes--;
+ }
+ if (!FreeFnPreds.empty() && NumPredTypes > NumPredTypesRequired) {
+ BBToSplit = SplitOffPreds(BBToSplit, FreeFnPreds, DT, LI);
+ NumPredTypes--;
+ }
+ if (!InvokePreds.empty() && NumPredTypes > NumPredTypesRequired) {
+ BBToSplit = SplitOffPreds(BBToSplit, InvokePreds, DT, LI);
+ NumPredTypes--;
+ }
+ if (!TFResumePreds.empty() && NumPredTypes > NumPredTypesRequired) {
+ BBToSplit = SplitOffPreds(BBToSplit, TFResumePreds, DT, LI);
+ NumPredTypes--;
+ }
+ // We handle detach and detached.rethrow predecessors at the end to preserve
+ // invariants on the CFG structure about the deadness of basic blocks after
+ // detached-rethrows.
+ if (!DetachPreds.empty() && NumPredTypes > NumPredTypesRequired) {
+ BBToSplit = SplitOffPreds(BBToSplit, DetachPreds, DT, LI);
+ NumPredTypes--;
+ }
+}
+
+// Set up all basic blocks such that each block's predecessors belong entirely to
+// one CSI ID space.
+void CSIImpl::setupBlocks(Function &F, const TargetLibraryInfo *TLI,
+ DominatorTree *DT, LoopInfo *LI) {
+ SmallPtrSet<BasicBlock *, 8> BlocksToSetup;
+ for (BasicBlock &BB : F) {
+ if (BB.isLandingPad())
+ BlocksToSetup.insert(&BB);
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
+ if (!isTapirPlaceholderSuccessor(II->getNormalDest()))
+ BlocksToSetup.insert(II->getNormalDest());
+ } else if (SyncInst *SI = dyn_cast<SyncInst>(BB.getTerminator()))
+ BlocksToSetup.insert(SI->getSuccessor(0));
+ }
+
+ for (BasicBlock *BB : BlocksToSetup)
+ setupBlock(BB, TLI, DT, LI);
+}
+
+// Split basic blocks so that ordinary call instructions terminate basic blocks.
+void CSIImpl::splitBlocksAtCalls(Function &F, DominatorTree *DT, LoopInfo *LI) {
+ // Split basic blocks after call instructions. 
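// For illustration (the block and value names here are made up): a block like
//
//   entry:
//     %r = call i32 @foo()
//     %s = add i32 %r, 1
//     br label %exit
//
// is split immediately after the call, so that the call becomes the last
// non-terminator instruction of its block:
//
//   entry:
//     %r = call i32 @foo()
//     br label %entry.split
//   entry.split:
//     %s = add i32 %r, 1
//     br label %exit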
+ SmallVector<Instruction *> CallsToSplit;
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB)
+ if (isa<CallInst>(I) &&
+ // Skip placeholder call instructions
+ !callsPlaceholderFunction(I) &&
+ // Skip a call instruction if it is immediately followed by a
+ // terminator
+ !I.getNextNode()->isTerminator() &&
+ // If the call does not return, don't bother splitting
+ !cast<CallInst>(&I)->doesNotReturn())
+ CallsToSplit.push_back(&I);
+
+ for (Instruction *Call : CallsToSplit)
+ SplitBlock(Call->getParent(), Call->getNextNode(), DT, LI);
+}
+
+bool CSIImpl::isFreeFn(const Instruction *I, const TargetLibraryInfo *TLI) {
+ if (!isa<CallBase>(I))
+ return false;
+ const CallBase *CB = dyn_cast<CallBase>(I);
+
+ if (!TLI)
+ return false;
+
+ if (getFreedOperand(CB, TLI))
+ return true;
+
+ // Ideally we would just use getFreedOperand to determine whether I is a call
+ // to a libfree function. But if -fno-builtin is used, then getFreedOperand
+ // won't recognize any libfree functions. For instrumentation purposes,
+ // it's sufficient to recognize the function name.
+ const StringRef FreeFnNames[] = {
+ "_ZdlPv",
+ "_ZdaPv",
+ "_ZdlPvj",
+ "_ZdlPvm",
+ "_ZdlPvRKSt9nothrow_t",
+ "_ZdlPvSt11align_val_t",
+ "_ZdaPvj",
+ "_ZdaPvm",
+ "_ZdaPvRKSt9nothrow_t",
+ "_ZdaPvSt11align_val_t",
+ "_ZdlPvSt11align_val_tRKSt9nothrow_t",
+ "_ZdaPvSt11align_val_tRKSt9nothrow_t",
+ "_ZdlPvjSt11align_val_t",
+ "_ZdlPvmSt11align_val_t",
+ "_ZdaPvjSt11align_val_t",
+ "_ZdaPvmSt11align_val_t",
+ "??3@YAXPAX@Z",
+ "??3@YAXPAXABUnothrow_t@std@@@Z",
+ "??3@YAXPAXI@Z",
+ "??3@YAXPEAX@Z",
+ "??3@YAXPEAXAEBUnothrow_t@std@@@Z",
+ "??3@YAXPEAX_K@Z",
+ "??_V@YAXPAX@Z",
+ "??_V@YAXPAXABUnothrow_t@std@@@Z",
+ "??_V@YAXPAXI@Z",
+ "??_V@YAXPEAX@Z",
+ "??_V@YAXPEAXAEBUnothrow_t@std@@@Z",
+ "??_V@YAXPEAX_K@Z",
+ "__kmpc_free_shared"
+ };
+
+ if (const Function *Called = CB->getCalledFunction()) {
+ StringRef FnName = Called->getName();
+ if (!llvm::any_of(FreeFnNames, [&](const StringRef FreeFnName) {
+ return FnName == FreeFnName;
+ }))
+ return false;
+
+ // Confirm that this function is a recognized library function
+ LibFunc F;
+ bool FoundLibFunc = TLI->getLibFunc(*Called, F);
+ return FoundLibFunc;
+ }
+
+ return false;
+}
+
+bool CSIImpl::isAllocFn(const Instruction *I, const TargetLibraryInfo *TLI) {
+ if (!isa<CallBase>(I))
+ return false;
+
+ if (!TLI)
+ return false;
+
+ if (isAllocationFn(I, TLI))
+ return true;
+
+ // Ideally we would just use isAllocationFn to determine whether I is a call
+ // to an allocation function. But if -fno-builtin is used, then isAllocationFn
+ // won't recognize any allocation functions. For instrumentation purposes,
+ // it's sufficient to recognize the function name. 
+ const StringRef AllocFnNames[] = { + "_Znwj", + "_ZnwjRKSt9nothrow_t", + "_ZnwjSt11align_val_t", + "_ZnwjSt11align_val_tRKSt9nothrow_t", + "_Znwm", + "_ZnwmRKSt9nothrow_t", + "_ZnwmSt11align_val_t", + "_ZnwmSt11align_val_tRKSt9nothrow_t", + "_Znaj", + "_ZnajRKSt9nothrow_t", + "_ZnajSt11align_val_t", + "_ZnajSt11align_val_tRKSt9nothrow_t", + "_Znam", + "_ZnamRKSt9nothrow_t", + "_ZnamSt11align_val_t", + "_ZnamSt11align_val_tRKSt9nothrow_t", + "??2@YAPAXI@Z", + "??2@YAPAXIABUnothrow_t@std@@@Z", + "??2@YAPEAX_K@Z", + "??2@YAPEAX_KAEBUnothrow_t@std@@@Z", + "??_U@YAPAXI@Z", + "??_U@YAPAXIABUnothrow_t@std@@@Z", + "??_U@YAPEAX_K@Z", + "??_U@YAPEAX_KAEBUnothrow_t@std@@@Z", + "strdup", + "dunder_strdup", + "strndup", + "dunder_strndup", + "__kmpc_alloc_shared", + "posix_memalign" + }; + + if (const Function *Called = dyn_cast(I)->getCalledFunction()) { + StringRef FnName = Called->getName(); + if (!llvm::any_of(AllocFnNames, [&](const StringRef AllocFnName) { + return FnName == AllocFnName; + })) + return false; + + // Confirm that this function is a recognized library function + LibFunc F; + bool FoundLibFunc = TLI->getLibFunc(*Called, F); + return FoundLibFunc; + } + + return false; +} + +int CSIImpl::getNumBytesAccessed(Type *OrigTy, const DataLayout &DL) { + assert(OrigTy->isSized()); + uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy); + if (TypeSize % 8 != 0) + return -1; + return TypeSize / 8; +} + +void CSIImpl::addLoadStoreInstrumentation(Instruction *I, + FunctionCallee BeforeFn, + FunctionCallee AfterFn, Value *CsiId, + Type *AddrType, Value *Addr, + int NumBytes, + CsiLoadStoreProperty &Prop) { + IRBuilder<> IRB(I); + Value *PropVal = Prop.getValue(IRB); + insertHookCall(I, BeforeFn, + {CsiId, IRB.CreatePointerCast(Addr, AddrType), + IRB.getInt32(NumBytes), PropVal}); + + BasicBlock::iterator Iter = ++I->getIterator(); + IRB.SetInsertPoint(&*Iter); + insertHookCall(&*Iter, AfterFn, + {CsiId, IRB.CreatePointerCast(Addr, AddrType), + IRB.getInt32(NumBytes), PropVal}); +} + +void CSIImpl::instrumentLoadOrStore(Instruction *I, + CsiLoadStoreProperty &Prop) { + IRBuilder<> IRB(I); + bool IsWrite = isa(I); + Value *Addr = IsWrite ? cast(I)->getPointerOperand() + : cast(I)->getPointerOperand(); + Type *Ty = + IsWrite ? cast(I)->getValueOperand()->getType() : I->getType(); + int NumBytes = getNumBytesAccessed(Ty, DL); + Type *AddrType = IRB.getInt8PtrTy(); + + if (NumBytes == -1) + return; // size that we don't recognize + + if (IsWrite) { + uint64_t LocalId = StoreFED.add(*I); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + addLoadStoreInstrumentation(I, CsiBeforeWrite, CsiAfterWrite, CsiId, + AddrType, Addr, NumBytes, Prop); + } else { // is read + uint64_t LocalId = LoadFED.add(*I); + Value *CsiId = LoadFED.localToGlobalId(LocalId, IRB); + addLoadStoreInstrumentation(I, CsiBeforeRead, CsiAfterRead, CsiId, AddrType, + Addr, NumBytes, Prop); + } +} + +void CSIImpl::instrumentAtomic(Instruction *I) { + // For now, print a message that this code contains atomics. + dbgs() + << "WARNING: Uninstrumented atomic operations in program-under-test!\n"; +} + +// TODO: This code for instrumenting memory intrinsics was borrowed +// from TSan. Different tools might have better ways to handle these +// function calls. Replace this logic with a more flexible solution, +// possibly one based on interpositioning. +// +// If a memset intrinsic gets inlined by the code gen, we will miss it. +// So, we either need to ensure the intrinsic is not inlined, or instrument it. 
+// We do not instrument memset/memmove/memcpy intrinsics (too complicated), +// instead we simply replace them with regular function calls, which are then +// intercepted by the run-time. +// Since our pass runs after everyone else, the calls should not be +// replaced back with intrinsics. If that becomes wrong at some point, +// we will need to call e.g. __csi_memset to avoid the intrinsics. +bool CSIImpl::instrumentMemIntrinsic(Instruction *I) { + IRBuilder<> IRB(I); + if (MemSetInst *M = dyn_cast(I)) { + Instruction *Call = IRB.CreateCall( + MemsetFn, + {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)}); + setInstrumentationDebugLoc(I, Call); + I->eraseFromParent(); + return true; + } else if (MemTransferInst *M = dyn_cast(I)) { + Instruction *Call = IRB.CreateCall( + isa(M) ? MemcpyFn : MemmoveFn, + {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)}); + setInstrumentationDebugLoc(I, Call); + I->eraseFromParent(); + return true; + } + return false; +} + +void CSIImpl::instrumentBasicBlock(BasicBlock &BB, const TaskInfo &TI) { + IRBuilder<> IRB(&*BB.getFirstInsertionPt()); + bool isEntry = isEntryBlock(BB, TI); + if (isEntry) + IRB.SetInsertPoint(getEntryBBInsertPt(BB)); + uint64_t LocalId = BasicBlockFED.add(BB); + uint64_t BBSizeId = BBSize.add(BB, GetTTI ? + &(*GetTTI)(*BB.getParent()) : nullptr); + assert(LocalId == BBSizeId && + "BB recieved different ID's in FED and sizeinfo tables."); + Value *CsiId = BasicBlockFED.localToGlobalId(LocalId, IRB); + CsiBBProperty Prop; + Prop.setIsLandingPad(BB.isLandingPad()); + Prop.setIsEHPad(BB.isEHPad()); + Instruction *TermI = BB.getTerminator(); + Value *PropVal = Prop.getValue(IRB); + insertHookCall(&*IRB.GetInsertPoint(), CsiBBEntry, {CsiId, PropVal}); + IRB.SetInsertPoint(TermI); + CallInst *Call = insertHookCall(TermI, CsiBBExit, {CsiId, PropVal}); + // If this is an entry block and the insert point is the terminator, make the + // BBExit hook be the insert point instead. + if (isEntry && getEntryBBInsertPt(BB) == TermI) + EntryBBInsertPt[&BB] = Call; +} + +// Helper function to get a value for the runtime trip count of the given loop. +static const SCEV *getRuntimeTripCount(Loop &L, ScalarEvolution *SE) { + BasicBlock *Latch = L.getLoopLatch(); + + const SCEV *BECountSC = SE->getExitCount(&L, Latch); + if (isa(BECountSC) || + !BECountSC->getType()->isIntegerTy()) { + LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n"); + return SE->getCouldNotCompute(); + } + + // Add 1 since the backedge count doesn't include the first loop iteration. + const SCEV *TripCountSC = + SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); + if (isa(TripCountSC)) { + LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n"); + return SE->getCouldNotCompute(); + } + + return TripCountSC; +} + +void CSIImpl::instrumentLoop(Loop &L, TaskInfo &TI, ScalarEvolution *SE) { + assert(L.isLoopSimplifyForm() && "CSI assumes loops are in simplified form."); + BasicBlock *Preheader = L.getLoopPreheader(); + BasicBlock *Header = L.getHeader(); + SmallVector ExitingBlocks, ExitBlocks; + L.getExitingBlocks(ExitingBlocks); + L.getUniqueExitBlocks(ExitBlocks); + + // We assign a local ID for this loop here, so that IDs for loops follow a + // depth-first ordering. 
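// For example (purely illustrative): if loop L0 contains subloops L1 and L2,
// then L0 takes its local ID before either subloop, and every loop nested in
// L1 is numbered before the recursion moves on to L2, yielding a depth-first
// (preorder) numbering of the loop nest.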
+ csi_id_t LocalId = LoopFED.add(*Header); + + // Recursively instrument each subloop. + for (Loop *SubL : L) + instrumentLoop(*SubL, TI, SE); + + // Record properties of this loop. + CsiLoopProperty LoopProp; + LoopProp.setIsTapirLoop(static_cast(getTaskIfTapirLoop(&L, &TI))); + LoopProp.setHasUniqueExitingBlock((ExitingBlocks.size() == 1)); + + IRBuilder<> IRB(Preheader->getTerminator()); + Value *LoopCsiId = LoopFED.localToGlobalId(LocalId, IRB); + Value *LoopPropVal = LoopProp.getValue(IRB); + + // Try to evaluate the runtime trip count for this loop. Default to a count + // of -1 for unknown trip counts. + Value *TripCount = IRB.getInt64(-1); + if (SE) { + const SCEV *TripCountSC = getRuntimeTripCount(L, SE); + if (!isa(TripCountSC)) { + // Extend the TripCount type if necessary. + if (TripCountSC->getType() != IRB.getInt64Ty()) + TripCountSC = SE->getZeroExtendExpr(TripCountSC, IRB.getInt64Ty()); + // Compute the trip count to pass to the CSI hook. + SCEVExpander Expander(*SE, DL, "csi"); + TripCount = Expander.expandCodeFor(TripCountSC, IRB.getInt64Ty(), + &*IRB.GetInsertPoint()); + } + } + + // Insert before-loop hook. + insertHookCall(&*IRB.GetInsertPoint(), CsiBeforeLoop, {LoopCsiId, TripCount, + LoopPropVal}); + + // Insert loop-body-entry hook. + IRB.SetInsertPoint(&*Header->getFirstInsertionPt()); + // TODO: Pass IVs to hook? + insertHookCall(&*IRB.GetInsertPoint(), CsiLoopBodyEntry, {LoopCsiId, + LoopPropVal}); + + // Insert hooks at the ends of the exiting blocks. + for (BasicBlock *BB : ExitingBlocks) { + // Record properties of this loop exit + CsiLoopExitProperty LoopExitProp; + LoopExitProp.setIsLatch(L.isLoopLatch(BB)); + + // Insert the loop-exit hook + IRB.SetInsertPoint(BB->getTerminator()); + csi_id_t LocalExitId = LoopExitFED.add(*BB); + Value *ExitCsiId = LoopFED.localToGlobalId(LocalExitId, IRB); + Value *LoopExitPropVal = LoopExitProp.getValue(IRB); + // TODO: For latches, record whether the loop will repeat. + insertHookCall(&*IRB.GetInsertPoint(), CsiLoopBodyExit, + {ExitCsiId, LoopCsiId, LoopExitPropVal}); + } + // Insert after-loop hooks. + for (BasicBlock *BB : ExitBlocks) { + IRB.SetInsertPoint(&*BB->getFirstInsertionPt()); + insertHookCall(&*IRB.GetInsertPoint(), CsiAfterLoop, {LoopCsiId, + LoopPropVal}); + } +} + +void CSIImpl::instrumentCallsite(Instruction *I, DominatorTree *DT) { + if (callsPlaceholderFunction(*I)) + return; + + bool IsInvoke = isa(I); + Function *Called = nullptr; + if (CallInst *CI = dyn_cast(I)) + Called = CI->getCalledFunction(); + else if (InvokeInst *II = dyn_cast(I)) + Called = II->getCalledFunction(); + + bool shouldInstrumentBefore = true; + bool shouldInstrumentAfter = true; + + // Does this call require instrumentation before or after? + if (Called) { + shouldInstrumentBefore = Config->DoesFunctionRequireInstrumentationForPoint( + Called->getName(), InstrumentationPoint::INSTR_BEFORE_CALL); + shouldInstrumentAfter = Config->DoesFunctionRequireInstrumentationForPoint( + Called->getName(), InstrumentationPoint::INSTR_AFTER_CALL); + } + + if (!shouldInstrumentAfter && !shouldInstrumentBefore) + return; + + IRBuilder<> IRB(I); + Value *DefaultID = getDefaultID(IRB); + uint64_t LocalId = CallsiteFED.add(*I, Called ? 
Called->getName() : ""); + Value *CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + Value *FuncId = nullptr; + GlobalVariable *FuncIdGV = nullptr; + if (Called) { + std::string GVName = CsiFuncIdVariablePrefix + Called->getName().str(); + Type *FuncIdGVTy = IRB.getInt64Ty(); + FuncIdGV = dyn_cast( + M.getOrInsertGlobal(GVName, FuncIdGVTy)); + assert(FuncIdGV); + FuncIdGV->setConstant(false); + if (Options.jitMode && !Called->empty()) + FuncIdGV->setLinkage(Called->getLinkage()); + else + FuncIdGV->setLinkage(GlobalValue::WeakAnyLinkage); + FuncIdGV->setInitializer(IRB.getInt64(CsiCallsiteUnknownTargetId)); + FuncId = IRB.CreateLoad(FuncIdGVTy, FuncIdGV); + } else { + // Unknown targets (i.e. indirect calls) are always unknown. + FuncId = IRB.getInt64(CsiCallsiteUnknownTargetId); + } + assert(FuncId != NULL); + CsiCallProperty Prop; + Value *DefaultPropVal = Prop.getValue(IRB); + Prop.setIsIndirect(!Called); + Value *PropVal = Prop.getValue(IRB); + if (shouldInstrumentBefore) + insertHookCall(I, CsiBeforeCallsite, {CallsiteId, FuncId, PropVal}); + + BasicBlock::iterator Iter(I); + if (shouldInstrumentAfter) { + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block and the + // exception block. + InvokeInst *II = cast(I); + insertHookCallInSuccessorBB(II->getNormalDest(), II->getParent(), + CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}, + {DefaultID, DefaultID, DefaultPropVal}); + insertHookCallInSuccessorBB(II->getUnwindDest(), II->getParent(), + CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}, + {DefaultID, DefaultID, DefaultPropVal}); + } else { + // Simple call instruction; there is only one "after" position. + Iter++; + IRB.SetInsertPoint(&*Iter); + PropVal = Prop.getValue(IRB); + insertHookCall(&*Iter, CsiAfterCallsite, {CallsiteId, FuncId, PropVal}); + } + } +} + +void CSIImpl::interposeCall(Instruction *I) { + CallBase *CB = dyn_cast(I); + if (!CB) + return; + + Function *Called = CB->getCalledFunction(); + + // Should we interpose this call? + if (Called && Called->getName().size() > 0) { + bool shouldInterpose = + Config->DoesFunctionRequireInterposition(Called->getName()); + + if (shouldInterpose) { + Function *interpositionFunction = getInterpositionFunction(Called); + assert(interpositionFunction != nullptr); + CB->setCalledFunction(interpositionFunction); + } + } +} + +static void getTaskExits(DetachInst *DI, + SmallVectorImpl &TaskReturns, + SmallVectorImpl &TaskResumes, + SmallVectorImpl &SharedEHExits, + TaskInfo &TI) { + BasicBlock *DetachedBlock = DI->getDetached(); + Task *T = TI.getTaskFor(DetachedBlock); + BasicBlock *ContinueBlock = DI->getContinue(); + + // Examine the predecessors of the continue block and save any predecessors in + // the task as a task return. + for (BasicBlock *Pred : predecessors(ContinueBlock)) { + if (T->simplyEncloses(Pred)) { + assert(isa(Pred->getTerminator())); + TaskReturns.push_back(Pred); + } + } + + // If the detach cannot throw, we're done. + if (!DI->hasUnwindDest()) + return; + + // Detached-rethrow exits can appear in strange places within a task-exiting + // spindle. Hence we loop over all blocks in the spindle to find + // detached rethrows. 
+ for (Spindle *S : depth_first>(T->getEntrySpindle())) { + if (S->isSharedEH()) { + if (llvm::any_of(predecessors(S), + [](const Spindle *Pred) { return !Pred->isSharedEH(); })) + SharedEHExits.push_back(S); + continue; + } + + for (BasicBlock *B : S->blocks()) + if (isDetachedRethrow(B->getTerminator())) + TaskResumes.push_back(B); + } +} + +BasicBlock::iterator +CSIImpl::getFirstInsertionPtInDetachedBlock(BasicBlock *Detached) { + for (Instruction &I : *Detached) + if (IntrinsicInst *II = dyn_cast(&I)) + if (Intrinsic::taskframe_use == II->getIntrinsicID()) + return ++(II->getIterator()); + return Detached->getFirstInsertionPt(); +} + +void CSIImpl::instrumentDetach(DetachInst *DI, unsigned SyncRegNum, + unsigned NumSyncRegs, DominatorTree *DT, + TaskInfo &TI, LoopInfo &LI) { + LLVMContext &Ctx = DI->getContext(); + BasicBlock *TaskEntryBlock = TI.getTaskFor(DI->getParent())->getEntry(); + IRBuilder<> IDBuilder(getEntryBBInsertPt(*TaskEntryBlock)); + bool TapirLoopBody = spawnsTapirLoopBody(DI, LI, TI); + ConstantInt *SyncRegVal = ConstantInt::get(Type::getInt32Ty(Ctx), SyncRegNum); + ConstantInt *DefaultSyncRegVal = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + CsiDetachProperty DetachProp; + DetachProp.setForTapirLoopBody(TapirLoopBody); + // Instrument the detach instruction itself + Value *DetachID; + { + IRBuilder<> IRB(DI); + uint64_t LocalID = DetachFED.add(*DI); + DetachID = DetachFED.localToGlobalId(LocalID, IDBuilder); + insertHookCall(DI, CsiDetach, + {DetachID, SyncRegVal, DetachProp.getValue(IRB)}); + } + + // Find the detached block, continuation, and associated reattaches. + BasicBlock *DetachedBlock = DI->getDetached(); + BasicBlock *ContinueBlock = DI->getContinue(); + Task *T = TI.getTaskFor(DetachedBlock); + SmallVector TaskExits, TaskResumes; + SmallVector SharedEHExits; + getTaskExits(DI, TaskExits, TaskResumes, SharedEHExits, TI); + + // Instrument the entry and exit points of the detached task. + { + // Instrument the entry point of the detached task. + IRBuilder<> IRB(&*getFirstInsertionPtInDetachedBlock(DetachedBlock)); + uint64_t LocalID = TaskFED.add(*DetachedBlock); + Value *TaskID = TaskFED.localToGlobalId(LocalID, IDBuilder); + CsiTaskProperty Prop; + Prop.setIsTapirLoopBody(TapirLoopBody); + Prop.setNumSyncReg(NumSyncRegs); + Instruction *Call = IRB.CreateCall(CsiTaskEntry, {TaskID, DetachID, + Prop.getValue(IRB)}); + setInstrumentationDebugLoc(*DetachedBlock, Call); + + // Instrument the exit points of the detached tasks. + for (BasicBlock *Exit : TaskExits) { + IRBuilder<> IRB(Exit->getTerminator()); + uint64_t LocalID = TaskExitFED.add(*Exit->getTerminator()); + Value *ExitID = TaskExitFED.localToGlobalId(LocalID, IDBuilder); + CsiTaskExitProperty ExitProp; + ExitProp.setIsTapirLoopBody(TapirLoopBody); + insertHookCall( + Exit->getTerminator(), CsiTaskExit, + {ExitID, TaskID, DetachID, SyncRegVal, ExitProp.getValue(IRB)}); + } + // Instrument the EH exits of the detached task. + for (BasicBlock *Exit : TaskResumes) { + IRBuilder<> IRB(Exit->getTerminator()); + uint64_t LocalID = TaskExitFED.add(*Exit->getTerminator()); + Value *ExitID = TaskExitFED.localToGlobalId(LocalID, IDBuilder); + CsiTaskExitProperty ExitProp; + ExitProp.setIsTapirLoopBody(TapirLoopBody); + insertHookCall( + Exit->getTerminator(), CsiTaskExit, + {ExitID, TaskID, DetachID, SyncRegVal, ExitProp.getValue(IRB)}); + } + + Value *DefaultID = getDefaultID(IDBuilder); + for (Spindle *SharedEH : SharedEHExits) { + // Skip shared-eh spindle exits that are placeholder unreachable blocks. 
+ if (isa( + SharedEH->getEntry()->getFirstNonPHIOrDbgOrLifetime())) + continue; + CsiTaskExitProperty ExitProp; + ExitProp.setIsTapirLoopBody(TapirLoopBody); + insertHookCallAtSharedEHSpindleExits( + SharedEH, T, CsiTaskExit, TaskExitFED, + {TaskID, DetachID, SyncRegVal, ExitProp.getValueImpl(Ctx)}, + {DefaultID, DefaultID, DefaultSyncRegVal, + CsiTaskExitProperty::getDefaultValueImpl(Ctx)}); + } + } + + // Instrument the continuation of the detach. + { + if (isCriticalContinueEdge(DI, 1)) + ContinueBlock = SplitCriticalEdge( + DI, 1, + CriticalEdgeSplittingOptions(DT, &LI).setSplitDetachContinue()); + + IRBuilder<> IRB(&*ContinueBlock->getFirstInsertionPt()); + uint64_t LocalID = DetachContinueFED.add(*ContinueBlock); + Value *ContinueID = DetachContinueFED.localToGlobalId(LocalID, IDBuilder); + CsiDetachContinueProperty ContProp; + ContProp.setForTapirLoopBody(TapirLoopBody); + Instruction *Call = + IRB.CreateCall(CsiDetachContinue, {ContinueID, DetachID, SyncRegVal, + ContProp.getValue(IRB)}); + setInstrumentationDebugLoc(*ContinueBlock, Call); + } + // Instrument the unwind of the detach, if it exists. + if (DI->hasUnwindDest()) { + BasicBlock *UnwindBlock = DI->getUnwindDest(); + BasicBlock *PredBlock = DI->getParent(); + if (Value *TF = T->getTaskFrameUsed()) { + // If the detached task uses a taskframe, then we want to insert the + // detach_continue instrumentation for the unwind destination after the + // taskframe.resume. + UnwindBlock = getTaskFrameResumeDest(TF); + assert(UnwindBlock && + "Detach with unwind uses a taskframe with no resume"); + PredBlock = getTaskFrameResume(TF)->getParent(); + } + Value *DefaultID = getDefaultID(IDBuilder); + uint64_t LocalID = DetachContinueFED.add(*UnwindBlock); + Value *ContinueID = DetachContinueFED.localToGlobalId(LocalID, IDBuilder); + CsiDetachContinueProperty ContProp; + Value *DefaultPropVal = ContProp.getValueImpl(Ctx); + ContProp.setIsUnwind(); + ContProp.setForTapirLoopBody(TapirLoopBody); + insertHookCallInSuccessorBB( + UnwindBlock, PredBlock, CsiDetachContinue, + {ContinueID, DetachID, SyncRegVal, ContProp.getValue(Ctx)}, + {DefaultID, DefaultID, DefaultSyncRegVal, DefaultPropVal}); + for (BasicBlock *DRPred : predecessors(UnwindBlock)) + if (isDetachedRethrow(DRPred->getTerminator(), DI->getSyncRegion())) + insertHookCallInSuccessorBB( + UnwindBlock, DRPred, CsiDetachContinue, + {ContinueID, DetachID, SyncRegVal, ContProp.getValue(Ctx)}, + {DefaultID, DefaultID, DefaultSyncRegVal, DefaultPropVal}); + } +} + +void CSIImpl::instrumentSync(SyncInst *SI, unsigned SyncRegNum) { + LLVMContext &Ctx = SI->getContext(); + IRBuilder<> IRB(SI); + Value *DefaultID = getDefaultID(IRB); + // Get the ID of this sync. + uint64_t LocalID = SyncFED.add(*SI); + Value *SyncID = SyncFED.localToGlobalId(LocalID, IRB); + ConstantInt *SyncRegVal = ConstantInt::get(Type::getInt32Ty(Ctx), SyncRegNum); + ConstantInt *DefaultSyncRegVal = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + + // Insert instrumentation before the sync. 
+ insertHookCall(SI, CsiBeforeSync, {SyncID, SyncRegVal}); + BasicBlock *SyncBB = SI->getParent(); + BasicBlock *SyncCont = SI->getSuccessor(0); + BasicBlock *SyncUnwind = nullptr; + if (SyncsWithUnwinds.count(SI)) { + InvokeInst *II = dyn_cast(SyncCont->getTerminator()); + SyncBB = SyncCont; + SyncUnwind = II->getUnwindDest(); + SyncCont = II->getNormalDest(); + } + + insertHookCallInSuccessorBB(SyncCont, SyncBB, CsiAfterSync, + {SyncID, SyncRegVal}, + {DefaultID, DefaultSyncRegVal}); + + // If we have no unwind for the sync, then we're done. + if (!SyncUnwind) + return; + + insertHookCallInSuccessorBB(SyncUnwind, SyncBB, CsiAfterSync, + {SyncID, SyncRegVal}, + {DefaultID, DefaultSyncRegVal}); +} + +void CSIImpl::instrumentAlloca(Instruction *I, TaskInfo &TI) { + IRBuilder<> IRB(I); + bool AllocaInEntryBlock = isEntryBlock(*I->getParent(), TI); + if (AllocaInEntryBlock) + IRB.SetInsertPoint(getEntryBBInsertPt(*I->getParent())); + AllocaInst *AI = cast(I); + + uint64_t LocalId = AllocaFED.add(*I); + Value *CsiId = AllocaFED.localToGlobalId(LocalId, IRB); + + CsiAllocaProperty Prop; + Prop.setIsStatic(AI->isStaticAlloca()); + Value *PropVal = Prop.getValue(IRB); + + // Get size of allocation. + uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType()); + Value *SizeVal = IRB.getInt64(Size); + if (AI->isArrayAllocation()) + SizeVal = IRB.CreateMul(SizeVal, + IRB.CreateZExtOrBitCast(AI->getArraySize(), + IRB.getInt64Ty())); + + BasicBlock::iterator Iter(I); + if (!AllocaInEntryBlock) { + Iter++; + IRB.SetInsertPoint(&*Iter); + } else { + Iter = IRB.GetInsertPoint(); + } + + Type *AddrType = IRB.getInt8PtrTy(); + Value *Addr = IRB.CreatePointerCast(I, AddrType); + insertHookCall(&*Iter, CsiAfterAlloca, {CsiId, Addr, SizeVal, PropVal}); +} + +bool CSIImpl::getAllocFnArgs(const Instruction *I, + SmallVectorImpl &AllocFnArgs, + Type *SizeTy, Type *AddrTy, + const TargetLibraryInfo &TLI) { + const CallBase *CB = dyn_cast(I); + + std::pair SizeArgs = getAllocSizeArgs(CB, &TLI); + // If the first size argument is null, then we failed to get size arguments + // for this call. + if (!SizeArgs.first) + return false; + + Value *AlignmentArg = getAllocAlignment(CB, &TLI); + + // Push the size arguments. + AllocFnArgs.push_back(SizeArgs.first); + // The second size argument is the number of elements allocated (i.e., for + // calloc-like functions). + if (SizeArgs.second) + AllocFnArgs.push_back(SizeArgs.second); + else + // Report number of elements == 1. + AllocFnArgs.push_back(ConstantInt::get(SizeTy, 1)); + + // Push the alignment argument or 0 if there is no alignment argument. + if (AlignmentArg) + AllocFnArgs.push_back(AlignmentArg); + else + AllocFnArgs.push_back(ConstantInt::get(SizeTy, 0)); + + // Return the old pointer argument for realloc-like functions or nullptr for + // other allocation functions. 
+ if (Value *Reallocated = getReallocatedOperand(CB)) + AllocFnArgs.push_back(Reallocated); + else + AllocFnArgs.push_back(Constant::getNullValue(AddrTy)); + + return true; +} + +void CSIImpl::instrumentAllocFn(Instruction *I, DominatorTree *DT, + const TargetLibraryInfo *TLI) { + bool IsInvoke = isa(I); + Function *Called = nullptr; + if (CallInst *CI = dyn_cast(I)) + Called = CI->getCalledFunction(); + else if (InvokeInst *II = dyn_cast(I)) + Called = II->getCalledFunction(); + + assert(Called && "Could not get called function for allocation fn."); + + IRBuilder<> IRB(I); + Value *DefaultID = getDefaultID(IRB); + uint64_t LocalId = AllocFnFED.add(*I); + Value *AllocFnId = AllocFnFED.localToGlobalId(LocalId, IRB); + + SmallVector AllocFnArgs; + getAllocFnArgs(I, AllocFnArgs, IntptrTy, IRB.getInt8PtrTy(), *TLI); + SmallVector DefaultAllocFnArgs({ + /* Allocated size */ Constant::getNullValue(IntptrTy), + /* Number of elements */ Constant::getNullValue(IntptrTy), + /* Alignment */ Constant::getNullValue(IntptrTy), + /* Old pointer */ Constant::getNullValue(IRB.getInt8PtrTy()), + }); + + CsiAllocFnProperty Prop; + Value *DefaultPropVal = Prop.getValue(IRB); + LibFunc AllocLibF; + TLI->getLibFunc(*Called, AllocLibF); + Prop.setAllocFnTy(static_cast(getAllocFnTy(AllocLibF))); + AllocFnArgs.push_back(Prop.getValue(IRB)); + DefaultAllocFnArgs.push_back(DefaultPropVal); + + BasicBlock::iterator Iter(I); + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block and the + // exception block. + InvokeInst *II = cast(I); + + BasicBlock *NormalBB = II->getNormalDest(); + unsigned SuccNum = GetSuccessorNumber(II->getParent(), NormalBB); + if (isCriticalEdge(II, SuccNum)) + NormalBB = + SplitCriticalEdge(II, SuccNum, CriticalEdgeSplittingOptions(DT)); + // Insert hook into normal destination. + { + IRB.SetInsertPoint(&*NormalBB->getFirstInsertionPt()); + SmallVector AfterAllocFnArgs; + AfterAllocFnArgs.push_back(AllocFnId); + AfterAllocFnArgs.push_back(IRB.CreatePointerCast(I, IRB.getInt8PtrTy())); + AfterAllocFnArgs.append(AllocFnArgs.begin(), AllocFnArgs.end()); + insertHookCall(&*IRB.GetInsertPoint(), CsiAfterAllocFn, AfterAllocFnArgs); + } + // Insert hook into unwind destination. + { + // The return value of the allocation function is not valid in the unwind + // destination. + SmallVector AfterAllocFnArgs, DefaultAfterAllocFnArgs; + AfterAllocFnArgs.push_back(AllocFnId); + AfterAllocFnArgs.push_back(Constant::getNullValue(IRB.getInt8PtrTy())); + AfterAllocFnArgs.append(AllocFnArgs.begin(), AllocFnArgs.end()); + DefaultAfterAllocFnArgs.push_back(DefaultID); + DefaultAfterAllocFnArgs.push_back( + Constant::getNullValue(IRB.getInt8PtrTy())); + DefaultAfterAllocFnArgs.append(DefaultAllocFnArgs.begin(), + DefaultAllocFnArgs.end()); + insertHookCallInSuccessorBB(II->getUnwindDest(), II->getParent(), + CsiAfterAllocFn, AfterAllocFnArgs, + DefaultAfterAllocFnArgs); + } + } else { + // Simple call instruction; there is only one "after" position. + Iter++; + IRB.SetInsertPoint(&*Iter); + SmallVector AfterAllocFnArgs; + AfterAllocFnArgs.push_back(AllocFnId); + AfterAllocFnArgs.push_back(IRB.CreatePointerCast(I, IRB.getInt8PtrTy())); + AfterAllocFnArgs.append(AllocFnArgs.begin(), AllocFnArgs.end()); + insertHookCall(&*Iter, CsiAfterAllocFn, AfterAllocFnArgs); + } +} + +void CSIImpl::instrumentFree(Instruction *I, const TargetLibraryInfo *TLI) { + // It appears that frees (and deletes) never throw. 
+ assert(isa(I) && "Free call is not a call instruction"); + + CallInst *FC = cast(I); + Function *Called = FC->getCalledFunction(); + assert(Called && "Could not get called function for free."); + + IRBuilder<> IRB(I); + uint64_t LocalId = FreeFED.add(*I); + Value *FreeId = FreeFED.localToGlobalId(LocalId, IRB); + + // All currently supported free functions free the first argument. + Value *Addr = FC->getArgOperand(0); + CsiFreeProperty Prop; + LibFunc FreeLibF; + TLI->getLibFunc(*Called, FreeLibF); + Prop.setFreeTy(static_cast(getFreeTy(FreeLibF))); + + insertHookCall(I, CsiBeforeFree, {FreeId, Addr, Prop.getValue(IRB)}); + BasicBlock::iterator Iter(I); + Iter++; + insertHookCall(&*Iter, CsiAfterFree, {FreeId, Addr, Prop.getValue(IRB)}); +} + +CallInst *CSIImpl::insertHookCall(Instruction *I, FunctionCallee HookFunction, + ArrayRef HookArgs) { + IRBuilder<> IRB(I); + CallInst *Call = IRB.CreateCall(HookFunction, HookArgs); + setInstrumentationDebugLoc(I, (Instruction *)Call); + return Call; +} + +bool CSIImpl::updateArgPHIs(BasicBlock *Succ, BasicBlock *BB, + FunctionCallee HookFunction, + ArrayRef HookArgs, + ArrayRef DefaultArgs) { + // If we've already created a PHI node in this block for the hook arguments, + // just add the incoming arguments to the PHIs. + auto Key = std::make_pair(Succ, cast(HookFunction.getCallee())); + if (ArgPHIs.count(Key)) { + unsigned HookArgNum = 0; + for (PHINode *ArgPHI : ArgPHIs[Key]) { + ArgPHI->setIncomingValue(ArgPHI->getBasicBlockIndex(BB), + HookArgs[HookArgNum]); + ++HookArgNum; + } + return true; + } + + // Create PHI nodes in this block for each hook argument. + IRBuilder<> IRB(&Succ->front()); + unsigned HookArgNum = 0; + for (Value *Arg : HookArgs) { + PHINode *ArgPHI = IRB.CreatePHI(Arg->getType(), 2); + for (BasicBlock *Pred : predecessors(Succ)) { + if (Pred == BB) + ArgPHI->addIncoming(Arg, BB); + else + ArgPHI->addIncoming(DefaultArgs[HookArgNum], Pred); + } + ArgPHIs[Key].push_back(ArgPHI); + ++HookArgNum; + } + return false; +} + +CallInst *CSIImpl::insertHookCallInSuccessorBB(BasicBlock *Succ, BasicBlock *BB, + FunctionCallee HookFunction, + ArrayRef HookArgs, + ArrayRef DefaultArgs) { + assert(HookFunction && "No hook function given."); + // If this successor block has a unique predecessor, just insert the hook call + // as normal. + if (Succ->getUniquePredecessor()) { + assert(Succ->getUniquePredecessor() == BB && + "BB is not unique predecessor of successor block"); + return insertHookCall(&*Succ->getFirstInsertionPt(), HookFunction, + HookArgs); + } + + if (updateArgPHIs(Succ, BB, HookFunction, HookArgs, DefaultArgs)) + return nullptr; + + auto Key = std::make_pair(Succ, cast(HookFunction.getCallee())); + SmallVector SuccessorHookArgs; + for (PHINode *ArgPHI : ArgPHIs[Key]) + SuccessorHookArgs.push_back(ArgPHI); + + IRBuilder<> IRB(&*Succ->getFirstInsertionPt()); + // Insert the hook call, using the PHI as the CSI ID. + CallInst *Call = IRB.CreateCall(HookFunction, SuccessorHookArgs); + setInstrumentationDebugLoc(*Succ, (Instruction *)Call); + + return Call; +} + +void CSIImpl::insertHookCallAtSharedEHSpindleExits( + Spindle *SharedEHSpindle, Task *T, FunctionCallee HookFunction, + FrontEndDataTable &FED, ArrayRef HookArgs, + ArrayRef DefaultArgs) { + // Get the set of shared EH spindles to examine. Store them in post order, so + // they can be evaluated in reverse post order. 
+ SmallVector WorkList; + for (Spindle *S : post_order>(SharedEHSpindle)) + WorkList.push_back(S); + + // Traverse the shared-EH spindles in reverse post order, updating the + // hook-argument PHI's along the way. + SmallPtrSet Visited; + for (Spindle *S : llvm::reverse(WorkList)) { + bool NoNewPHINode = true; + // If this spindle is the first shared-EH spindle in the traversal, use the + // given hook arguments to update the PHI node. + if (S == SharedEHSpindle) { + for (Spindle::SpindleEdge &InEdge : S->in_edges()) { + Spindle *SPred = InEdge.first; + BasicBlock *Pred = InEdge.second; + if (T->contains(SPred)) + NoNewPHINode &= + updateArgPHIs(S->getEntry(), Pred, HookFunction, HookArgs, + DefaultArgs); + } + } else { + // Otherwise update the PHI node based on the predecessor shared-eh + // spindles in this RPO traversal. + for (Spindle::SpindleEdge &InEdge : S->in_edges()) { + Spindle *SPred = InEdge.first; + BasicBlock *Pred = InEdge.second; + if (Visited.count(SPred)) { + auto Key = std::make_pair(SPred->getEntry(), + cast(HookFunction.getCallee())); + SmallVector NewHookArgs( + ArgPHIs[Key].begin(), ArgPHIs[Key].end()); + NoNewPHINode &= + updateArgPHIs(S->getEntry(), Pred, HookFunction, NewHookArgs, + DefaultArgs); + } + } + } + Visited.insert(S); + + if (NoNewPHINode) + continue; + + // Detached-rethrow exits can appear in strange places within a task-exiting + // spindle. Hence we loop over all blocks in the spindle to find detached + // rethrows. + auto Key = std::make_pair(S->getEntry(), + cast(HookFunction.getCallee())); + for (BasicBlock *B : S->blocks()) { + if (isDetachedRethrow(B->getTerminator())) { + IRBuilder<> IRB(B->getTerminator()); + uint64_t LocalID = FED.add(*B->getTerminator()); + Value *HookID = FED.localToGlobalId(LocalID, IRB); + SmallVector Args({HookID}); + Args.append(ArgPHIs[Key].begin(), ArgPHIs[Key].end()); + Instruction *Call = IRB.CreateCall(HookFunction, Args); + setInstrumentationDebugLoc(*B, Call); + } + } + } +} + +void CSIImpl::initializeFEDTables() { + FunctionFED = FrontEndDataTable(M, CsiFunctionBaseIdName, + "__csi_unit_fed_table_function", + "__csi_unit_function_name_", + /*UseExistingBaseId=*/false); + FunctionExitFED = FrontEndDataTable(M, CsiFunctionExitBaseIdName, + "__csi_unit_fed_table_function_exit", + "__csi_unit_function_name_"); + LoopFED = FrontEndDataTable(M, CsiLoopBaseIdName, + "__csi_unit_fed_table_loop"); + LoopExitFED = FrontEndDataTable(M, CsiLoopExitBaseIdName, + "__csi_unit_fed_table_loop"); + BasicBlockFED = FrontEndDataTable(M, CsiBasicBlockBaseIdName, + "__csi_unit_fed_table_basic_block"); + CallsiteFED = FrontEndDataTable(M, CsiCallsiteBaseIdName, + "__csi_unit_fed_table_callsite", + "__csi_unit_function_name_"); + LoadFED = FrontEndDataTable(M, CsiLoadBaseIdName, + "__csi_unit_fed_table_load"); + StoreFED = FrontEndDataTable(M, CsiStoreBaseIdName, + "__csi_unit_fed_table_store"); + AllocaFED = FrontEndDataTable(M, CsiAllocaBaseIdName, + "__csi_unit_fed_table_alloca", + "__csi_unit_variable_name_"); + DetachFED = FrontEndDataTable(M, CsiDetachBaseIdName, + "__csi_unit_fed_table_detach"); + TaskFED = FrontEndDataTable(M, CsiTaskBaseIdName, + "__csi_unit_fed_table_task"); + TaskExitFED = FrontEndDataTable(M, CsiTaskExitBaseIdName, + "__csi_unit_fed_table_task_exit"); + DetachContinueFED = FrontEndDataTable(M, CsiDetachContinueBaseIdName, + "__csi_unit_fed_table_detach_continue"); + SyncFED = FrontEndDataTable(M, CsiSyncBaseIdName, + "__csi_unit_fed_table_sync"); + AllocFnFED = FrontEndDataTable(M, CsiAllocFnBaseIdName, + 
"__csi_unit_fed_table_allocfn", + "__csi_unit_variable_name_"); + FreeFED = FrontEndDataTable(M, CsiFreeBaseIdName, + "__csi_unit_fed_free"); +} + +void CSIImpl::initializeSizeTables() { + BBSize = SizeTable(M, CsiBasicBlockBaseIdName); +} + +uint64_t CSIImpl::getLocalFunctionID(Function &F) { + uint64_t LocalId = FunctionFED.add(F); + FuncOffsetMap[F.getName()] = LocalId; + return LocalId; +} + +void CSIImpl::generateInitCallsiteToFunction() { + LLVMContext &C = M.getContext(); + BasicBlock *EntryBB = BasicBlock::Create(C, "", InitCallsiteToFunction); + IRBuilder<> IRB(ReturnInst::Create(C, EntryBB)); + + GlobalVariable *Base = FunctionFED.baseId(); + Type *BaseTy = IRB.getInt64Ty(); + LoadInst *LI = IRB.CreateLoad(BaseTy, Base); + // Traverse the map of function name -> function local id. Generate + // a store of each function's global ID to the corresponding weak + // global variable. + for (const auto &it : FuncOffsetMap) { + std::string GVName = CsiFuncIdVariablePrefix + it.first.str(); + GlobalVariable *GV = nullptr; + if ((GV = M.getGlobalVariable(GVName)) == nullptr) { + GV = new GlobalVariable(M, IRB.getInt64Ty(), false, + (Options.jitMode ? GlobalValue::ExternalLinkage : + GlobalValue::WeakAnyLinkage), + IRB.getInt64(CsiCallsiteUnknownTargetId), GVName); + } + assert(GV); + IRB.CreateStore(IRB.CreateAdd(LI, IRB.getInt64(it.second)), GV); + } +} + +void CSIImpl::initializeCsi() { + IntptrTy = DL.getIntPtrType(M.getContext()); + + initializeFEDTables(); + initializeSizeTables(); + if (Options.InstrumentFuncEntryExit) + initializeFuncHooks(); + if (Options.InstrumentMemoryAccesses) + initializeLoadStoreHooks(); + if (Options.InstrumentLoops) + initializeLoopHooks(); + if (Options.InstrumentBasicBlocks) + initializeBasicBlockHooks(); + if (Options.InstrumentCalls) + initializeCallsiteHooks(); + if (Options.InstrumentMemIntrinsics) + initializeMemIntrinsicsHooks(); + if (Options.InstrumentTapir) + initializeTapirHooks(); + if (Options.InstrumentAllocas) + initializeAllocaHooks(); + if (Options.InstrumentAllocFns) + initializeAllocFnHooks(); + + FunctionType *FnType = + FunctionType::get(Type::getVoidTy(M.getContext()), {}, false); + InitCallsiteToFunction = cast(M.getOrInsertFunction( + CsiInitCallsiteToFunctionName, + FnType) + .getCallee()); + assert(InitCallsiteToFunction); + + InitCallsiteToFunction->setLinkage(GlobalValue::InternalLinkage); + + /* + The runtime declares this as a __thread var --- need to change this decl + generation or the tool won't compile DisableInstrGV = new GlobalVariable(M, + IntegerType::get(M.getContext(), 1), false, GlobalValue::ExternalLinkage, + nullptr, CsiDisableInstrumentationName, nullptr, + GlobalValue::GeneralDynamicTLSModel, 0, + true); + */ +} + +// Create a struct type to match the unit_fed_entry_t type in csirt.c. 
+StructType *CSIImpl::getUnitFedTableType(LLVMContext &C, + PointerType *EntryPointerType) { + return StructType::get(IntegerType::get(C, 64), Type::getInt8PtrTy(C, 0), + EntryPointerType); +} + +Constant *CSIImpl::fedTableToUnitFedTable(Module &M, + StructType *UnitFedTableType, + FrontEndDataTable &FedTable) { + Constant *NumEntries = + ConstantInt::get(IntegerType::get(M.getContext(), 64), FedTable.size()); + Constant *BaseIdPtr = ConstantExpr::getPointerCast( + FedTable.baseId(), Type::getInt8PtrTy(M.getContext(), 0)); + Constant *InsertedTable = FedTable.insertIntoModule(M); + return ConstantStruct::get(UnitFedTableType, NumEntries, BaseIdPtr, + InsertedTable); +} + +void CSIImpl::collectUnitFEDTables() { + LLVMContext &C = M.getContext(); + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + + // The order of the FED tables here must match the enum in csirt.c and the + // instrumentation_counts_t in csi.h. + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, FunctionFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, FunctionExitFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, LoopFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, LoopExitFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, BasicBlockFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, CallsiteFED)); + UnitFedTables.push_back(fedTableToUnitFedTable(M, UnitFedTableType, LoadFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, StoreFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, DetachFED)); + UnitFedTables.push_back(fedTableToUnitFedTable(M, UnitFedTableType, TaskFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, TaskExitFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, DetachContinueFED)); + UnitFedTables.push_back(fedTableToUnitFedTable(M, UnitFedTableType, SyncFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, AllocaFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, AllocFnFED)); + UnitFedTables.push_back(fedTableToUnitFedTable(M, UnitFedTableType, FreeFED)); +} + +// Create a struct type to match the unit_obj_entry_t type in csirt.c. 
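For reference, the two runtime-side descriptors referenced here (unit_fed_entry_t above and unit_obj_entry_t below) presumably look something like the following sketch. The field names and the csi_id_t typedef are assumptions, but the layouts mirror getUnitFedTableType and getUnitSizeTableType.

```c
#include <stdint.h>

typedef int64_t csi_id_t;               /* assumed: CSI IDs are 64-bit */
typedef struct source_loc source_loc_t; /* per-entry FED record, sketched earlier */
typedef struct sizeinfo sizeinfo_t;     /* per-block size record, sketched earlier */

/* Sketch of unit_fed_entry_t in csirt.c: one descriptor per FED table. */
typedef struct {
  int64_t num_entries;         /* number of entries in this FED table */
  csi_id_t *id_base;           /* the unit's base ID, assigned at unit init */
  const source_loc_t *entries; /* the __csi_unit_fed_table_* array emitted above */
} unit_fed_entry_t;

/* Sketch of unit_obj_entry_t in csirt.c: one descriptor per size table. */
typedef struct {
  int64_t num_entries;       /* number of entries in this size table */
  const sizeinfo_t *entries; /* the __csi_unit_size_table array emitted above */
} unit_obj_entry_t;
```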
+StructType *CSIImpl::getUnitSizeTableType(LLVMContext &C,
+                                          PointerType *EntryPointerType) {
+  return StructType::get(IntegerType::get(C, 64), EntryPointerType);
+}
+
+Constant *CSIImpl::sizeTableToUnitSizeTable(Module &M,
+                                            StructType *UnitSizeTableType,
+                                            SizeTable &SzTable) {
+  Constant *NumEntries =
+      ConstantInt::get(IntegerType::get(M.getContext(), 64), SzTable.size());
+  // Constant *BaseIdPtr =
+  //     ConstantExpr::getPointerCast(FedTable.baseId(),
+  //                                  Type::getInt8PtrTy(M.getContext(), 0));
+  Constant *InsertedTable = SzTable.insertIntoModule(M);
+  return ConstantStruct::get(UnitSizeTableType, NumEntries, InsertedTable);
+}
+
+void CSIImpl::collectUnitSizeTables() {
+  LLVMContext &C = M.getContext();
+  StructType *UnitSizeTableType =
+      getUnitSizeTableType(C, SizeTable::getPointerType(C));
+
+  UnitSizeTables.push_back(
+      sizeTableToUnitSizeTable(M, UnitSizeTableType, BBSize));
+}
+
+CallInst *CSIImpl::createRTUnitInitCall(IRBuilder<> &IRB) {
+  LLVMContext &C = M.getContext();
+
+  StructType *UnitFedTableType =
+      getUnitFedTableType(C, FrontEndDataTable::getPointerType(C));
+  StructType *UnitSizeTableType =
+      getUnitSizeTableType(C, SizeTable::getPointerType(C));
+
+  // Lookup __csirt_unit_init
+  SmallVector<Type *, 4> InitArgTypes({IRB.getInt8PtrTy(),
+                                       PointerType::get(UnitFedTableType, 0),
+                                       PointerType::get(UnitSizeTableType, 0),
+                                       InitCallsiteToFunction->getType()});
+  FunctionType *InitFunctionTy =
+      FunctionType::get(IRB.getVoidTy(), InitArgTypes, false);
+  RTUnitInit = M.getOrInsertFunction(CsiRtUnitInitName, InitFunctionTy);
+  assert(isa<Function>(RTUnitInit.getCallee()) &&
+         "Failed to get or insert __csirt_unit_init function");
+
+  ArrayType *UnitFedTableArrayType =
+      ArrayType::get(UnitFedTableType, UnitFedTables.size());
+  Constant *FEDTable = ConstantArray::get(UnitFedTableArrayType, UnitFedTables);
+  GlobalVariable *FEDGV = new GlobalVariable(
+      M, UnitFedTableArrayType, false, GlobalValue::InternalLinkage, FEDTable,
+      CsiUnitFedTableArrayName);
+  ArrayType *UnitSizeTableArrayType =
+      ArrayType::get(UnitSizeTableType, UnitSizeTables.size());
+  Constant *SzTable =
+      ConstantArray::get(UnitSizeTableArrayType, UnitSizeTables);
+  GlobalVariable *SizeGV = new GlobalVariable(
+      M, UnitSizeTableArrayType, false, GlobalValue::InternalLinkage, SzTable,
+      CsiUnitSizeTableArrayName);
+
+  Constant *Zero = ConstantInt::get(IRB.getInt32Ty(), 0);
+  Value *GepArgs[] = {Zero, Zero};
+
+  // Insert call to __csirt_unit_init
+  return IRB.CreateCall(
+      RTUnitInit,
+      {IRB.CreateGlobalStringPtr(M.getName(), "__csi_module_name"),
+       ConstantExpr::getGetElementPtr(FEDGV->getValueType(), FEDGV, GepArgs),
+       ConstantExpr::getGetElementPtr(SizeGV->getValueType(), SizeGV, GepArgs),
+       InitCallsiteToFunction});
+}
+
+void CSIImpl::finalizeCsi() {
+  // Insert __csi_func_id_ weak symbols for all defined functions and
+  // generate the runtime code that stores to all of them.
+  generateInitCallsiteToFunction();
+
+  Function *Ctor = Function::Create(
+      FunctionType::get(Type::getVoidTy(M.getContext()), false),
+      GlobalValue::InternalLinkage, CsiRtUnitCtorName, &M);
+  BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
+  IRBuilder<> IRB(ReturnInst::Create(M.getContext(), CtorBB));
+  CallInst *Call = createRTUnitInitCall(IRB);
+  // TODO: Add version-check to the constructor? See
+  // ModuleUtils::createSanitizerCtorAndInitFunctions for example.
+
+  // Add the ctor to llvm.global_ctors via appendToGlobalCtors() if either
+  // llvm.global_ctors does not exist or it exists with an initializer.
One of + // these two conditions should always hold for modules compiled normally, but + // appendToGlobalCtors can crash if a tool, such as bugpoint, removes the + // initializer from llvm.global_ctors. This change facilitates using bugpoint + // to debug crashes involving CSI. + if (GlobalVariable *GVCtor = M.getNamedGlobal("llvm.global_ctors")) { + if (GVCtor->hasInitializer()) + appendToGlobalCtors(M, Ctor, CsiUnitCtorPriority); + } else { + appendToGlobalCtors(M, Ctor, CsiUnitCtorPriority); + } + + CallGraphNode *CNCtor = CG->getOrInsertFunction(Ctor); + CallGraphNode *CNFunc = + CG->getOrInsertFunction(cast(RTUnitInit.getCallee())); + CNCtor->addCalledFunction(Call, CNFunc); +} + +namespace { +// Custom DiagnosticInfo for linking a tool bitcode file. +class CSILinkDiagnosticInfo : public DiagnosticInfo { + const Module *SrcM; + const Twine &Msg; + +public: + CSILinkDiagnosticInfo(DiagnosticSeverity Severity, const Module *SrcM, + const Twine &Msg) + : DiagnosticInfo(DK_Lowering, Severity), SrcM(SrcM), Msg(Msg) {} + void print(DiagnosticPrinter &DP) const override { + DP << "linking module '" << SrcM->getModuleIdentifier() << "': " << Msg; + } +}; + +// Custom DiagnosticHandler to handle diagnostics arising when linking a tool +// bitcode file. +class CSIDiagnosticHandler final : public DiagnosticHandler { + const Module *SrcM; + DiagnosticHandler *OrigHandler; + +public: + CSIDiagnosticHandler(const Module *SrcM, DiagnosticHandler *OrigHandler) + : SrcM(SrcM), OrigHandler(OrigHandler) {} + + bool handleDiagnostics(const DiagnosticInfo &DI) override { + if (DI.getKind() != DK_Linker) + return OrigHandler->handleDiagnostics(DI); + + std::string MsgStorage; + { + raw_string_ostream Stream(MsgStorage); + DiagnosticPrinterRawOStream DP(Stream); + DI.print(DP); + } + return OrigHandler->handleDiagnostics( + CSILinkDiagnosticInfo(DI.getSeverity(), SrcM, MsgStorage)); + } +}; +} // namespace + +static GlobalVariable *copyGlobalArray(const char *Array, Module &M) { + // Get the current set of static global constructors. + if (GlobalVariable *GVA = M.getNamedGlobal(Array)) { + if (Constant *Init = GVA->getInitializer()) { + // Copy the existing global constructors into a new variable. + GlobalVariable *NGV = new GlobalVariable( + Init->getType(), GVA->isConstant(), GVA->getLinkage(), Init, "", + GVA->getThreadLocalMode()); + GVA->getParent()->insertGlobalVariable(GVA->getIterator(), NGV); + return NGV; + } + } + return nullptr; +} + +// Replace the modified global array list with the copy of the old version. +static void replaceGlobalArray(const char *Array, Module &M, + GlobalVariable *GVACopy) { + // Get the current version of the global array. + GlobalVariable *GVA = M.getNamedGlobal(Array); + GVACopy->takeName(GVA); + + // Nuke the old list, replacing any uses with the new one. + if (!GVA->use_empty()) { + Constant *V = GVACopy; + if (V->getType() != GVA->getType()) + V = ConstantExpr::getBitCast(V, GVA->getType()); + GVA->replaceAllUsesWith(V); + } + GVA->eraseFromParent(); +} + +// Restore the global array to its copy of its previous value. +static void restoreGlobalArray(const char *Array, Module &M, + GlobalVariable *GVACopy, bool GVAModified) { + if (GVACopy) { + if (GVAModified) { + // Replace the new global array with the old copy. + replaceGlobalArray(Array, M, GVACopy); + } else { + // The bitcode file doesn't add to the global array, so just delete the + // copy. + assert(GVACopy->use_empty()); + GVACopy->eraseFromParent(); + } + } else { // No global array was copied. 
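+    // The linked bitcode created this array from scratch, so it is replaced
+    // below with an empty array, dropping every entry the bitcode added.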
+ if (GVAModified) { + // Create a zero-initialized version of the global array. + GlobalVariable *NewGV = M.getNamedGlobal(Array); + ConstantArray *NewCA = cast(NewGV->getInitializer()); + Constant *CARepl = ConstantArray::get( + ArrayType::get(NewCA->getType()->getElementType(), 0), {}); + GlobalVariable *GVRepl = new GlobalVariable( + CARepl->getType(), NewGV->isConstant(), NewGV->getLinkage(), CARepl, + "", NewGV->getThreadLocalMode()); + NewGV->getParent()->insertGlobalVariable(NewGV->getIterator(), GVRepl); + + // Replace the global array with the zero-initialized version. + replaceGlobalArray(Array, M, GVRepl); + } else { + // Nothing to do. + } + } +} + +void CSIImpl::linkInToolFromBitcode(const std::string &BitcodePath) { + if (BitcodePath != "") { + LLVMContext &C = M.getContext(); + LLVM_DEBUG(dbgs() << "Using external bitcode file for CSI: " + << BitcodePath << "\n"); + SMDiagnostic SMD; + + std::unique_ptr ToolModule = parseIRFile(BitcodePath, SMD, C); + if (!ToolModule) { + C.emitError("CSI: Failed to parse bitcode file: " + BitcodePath); + return; + } + + // Get the original DiagnosticHandler for this context. + std::unique_ptr OrigDiagHandler = + C.getDiagnosticHandler(); + + // Setup a CSIDiagnosticHandler for this context, to handle + // diagnostics that arise from linking ToolModule. + C.setDiagnosticHandler(std::make_unique( + ToolModule.get(), OrigDiagHandler.get())); + + // Get list of functions in ToolModule. + for (Function &TF : *ToolModule) + FunctionsInBitcode.insert(std::string(TF.getName())); + + GlobalVariable *GVCtorCopy = copyGlobalArray("llvm.global_ctors", M); + GlobalVariable *GVDtorCopy = copyGlobalArray("llvm.global_dtors", M); + bool BitcodeAddsCtors = false, BitcodeAddsDtors = false; + + // Link the external module into the current module, copying over global + // values. + bool Fail = Linker::linkModules( + M, std::move(ToolModule), Linker::Flags::LinkOnlyNeeded, + [&](Module &M, const StringSet<> &GVS) { + for (StringRef GVName : GVS.keys()) { + LLVM_DEBUG(dbgs() << "Linking global value " << GVName << "\n"); + if (GVName == "llvm.global_ctors") { + BitcodeAddsCtors = true; + continue; + } else if (GVName == "llvm.global_dtors") { + BitcodeAddsDtors = true; + continue; + } + // Record this GlobalValue as linked from the bitcode. + LinkedFromBitcode.insert(M.getNamedValue(GVName)); + if (Function *Fn = M.getFunction(GVName)) { + if (!Fn->isDeclaration() && !Fn->hasComdat()) { + // We set the function's linkage as available_externally, so + // that subsequent optimizations can remove these definitions + // from the module. We don't want this module redefining any of + // these symbols, even if they aren't inlined, because the + // OpenCilk runtime library will provide those definitions + // later. + Fn->setLinkage(Function::AvailableExternallyLinkage); + } + } else if (GlobalVariable *GV = M.getGlobalVariable(GVName)) { + if (!GV->isDeclaration() && !GV->hasComdat()) { + GV->setLinkage(Function::AvailableExternallyLinkage); + } + } + } + }); + if (Fail) + C.emitError("CSI: Failed to link bitcode file: " + Twine(BitcodePath)); + + // Restore the original DiagnosticHandler for this context. 
+ C.setDiagnosticHandler(std::move(OrigDiagHandler)); + + restoreGlobalArray("llvm.global_ctors", M, GVCtorCopy, BitcodeAddsCtors); + restoreGlobalArray("llvm.global_dtors", M, GVDtorCopy, BitcodeAddsDtors); + + LinkedBitcode = true; + } +} + +void CSIImpl::loadConfiguration() { + if (ClConfigurationFilename != "") + Config = InstrumentationConfig::ReadFromConfigurationFile( + ClConfigurationFilename); + else + Config = InstrumentationConfig::GetDefault(); + + Config->SetConfigMode(ClConfigurationMode); +} + +Value *CSIImpl::lookupUnderlyingObject(Value *Addr) const { + return getUnderlyingObject(Addr, 0); + // if (!UnderlyingObject.count(Addr)) + // UnderlyingObject[Addr] = getUnderlyingObject(Addr, 0); + + // return UnderlyingObject[Addr]; +} + +bool CSIImpl::shouldNotInstrumentFunction(Function &F) { + Module &M = *F.getParent(); + // Don't instrument standard library calls. +#ifdef WIN32 + if (F.hasName() && F.getName().find("_") == 0) { + return true; + } +#endif + + if (F.hasName() && F.getName().find("__csi") != std::string::npos) + return true; + + // Never instrument the CSI ctor. + if (F.hasName() && F.getName() == CsiRtUnitCtorName) + return true; + + // Don't instrument anything in the startup section or the __StaticInit + // section (MacOSX). + if (F.getSection() == ".text.startup" || + F.getSection().find("__StaticInit") != std::string::npos) + return true; + + // Don't instrument functions that will run before or + // simultaneously with CSI ctors. + GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); + if (GV == nullptr) + return false; + if (!GV->hasInitializer() || GV->getInitializer()->isNullValue()) + return false; + + ConstantArray *CA = cast(GV->getInitializer()); + for (Use &OP : CA->operands()) { + if (isa(OP)) + continue; + ConstantStruct *CS = cast(OP); + + if (Function *CF = dyn_cast(CS->getOperand(1))) { + uint64_t Priority = + dyn_cast(CS->getOperand(0))->getLimitedValue(); + if (Priority <= CsiUnitCtorPriority && CF->getName() == F.getName()) { + // Do not instrument F. + return true; + } + } + } + // false means do instrument it. + return false; +} + +bool CSIImpl::isVtableAccess(const Instruction *I) { + if (const MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) + return Tag->isTBAAVtableAccess(); + return false; +} + +bool CSIImpl::addrPointsToConstantData(const Value *Addr) { + // If this is a GEP, just analyze its pointer operand. 
+ if (const GetElementPtrInst *GEP = dyn_cast(Addr)) + Addr = GEP->getPointerOperand(); + + if (const GlobalVariable *GV = dyn_cast(Addr)) { + if (GV->isConstant()) { + return true; + } + } else if (const LoadInst *L = dyn_cast(Addr)) { + if (isVtableAccess(L)) { + return true; + } + } + return false; +} + +bool CSIImpl::isAtomic(const Instruction *I) { + if (const LoadInst *LI = dyn_cast(I)) + return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread; + if (const StoreInst *SI = dyn_cast(I)) + return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread; + if (isa(I)) + return true; + if (isa(I)) + return true; + if (isa(I)) + return true; + return false; +} + +bool CSIImpl::isThreadLocalObject(const Value *Obj) { + if (const IntrinsicInst *II = dyn_cast(Obj)) + return Intrinsic::threadlocal_address == II->getIntrinsicID(); + if (const GlobalValue *GV = dyn_cast(Obj)) + return GV->isThreadLocal(); + return false; +} + +void CSIImpl::computeLoadAndStoreProperties( + SmallVectorImpl> + &LoadAndStoreProperties, + SmallVectorImpl &BBLoadsAndStores) { + SmallSet WriteTargets; + + for (SmallVectorImpl::reverse_iterator + It = BBLoadsAndStores.rbegin(), + E = BBLoadsAndStores.rend(); + It != E; ++It) { + Instruction *I = *It; + if (StoreInst *Store = dyn_cast(I)) { + Value *Addr = Store->getPointerOperand(); + WriteTargets.insert(Addr); + CsiLoadStoreProperty Prop; + // Update alignment property data + Prop.setAlignment(MaybeAlign(Store->getAlign())); + // Set vtable-access property + Prop.setIsVtableAccess(isVtableAccess(Store)); + // Set constant-data-access property + Prop.setIsConstant(addrPointsToConstantData(Addr)); + Value *Obj = lookupUnderlyingObject(Addr); + // Set is-on-stack property + Prop.setIsOnStack(isa(Obj)); + // Set may-be-captured property + Prop.setMayBeCaptured(isa(Obj) || + PointerMayBeCaptured(Addr, true, true)); + // Set is-thread-local property + Prop.setIsThreadLocal(isThreadLocalObject(Obj)); + LoadAndStoreProperties.push_back(std::make_pair(I, Prop)); + } else { + LoadInst *Load = cast(I); + Value *Addr = Load->getPointerOperand(); + CsiLoadStoreProperty Prop; + // Update alignment property data + Prop.setAlignment(MaybeAlign(Load->getAlign())); + // Set vtable-access property + Prop.setIsVtableAccess(isVtableAccess(Load)); + // Set constant-data-access-property + Prop.setIsConstant(addrPointsToConstantData(Addr)); + Value *Obj = lookupUnderlyingObject(Addr); + // Set is-on-stack property + Prop.setIsOnStack(isa(Obj)); + // Set may-be-captured property + Prop.setMayBeCaptured(isa(Obj) || + PointerMayBeCaptured(Addr, true, true)); + // Set is-thread-local property + Prop.setIsThreadLocal(isThreadLocalObject(Obj)); + // Set load-read-before-write-in-bb property + bool HasBeenSeen = WriteTargets.count(Addr) > 0; + Prop.setLoadReadBeforeWriteInBB(HasBeenSeen); + LoadAndStoreProperties.push_back(std::make_pair(I, Prop)); + } + } + BBLoadsAndStores.clear(); +} + +// Update the attributes on the instrumented function that might be invalidated +// by the inserted instrumentation. 
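+// Specifically, drop the readonly/readnone attributes and widen the declared
+// memory effects, since the inserted hooks read and write memory.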
+void CSIImpl::updateInstrumentedFnAttrs(Function &F) { + F.removeFnAttr(Attribute::ReadOnly); + F.removeFnAttr(Attribute::ReadNone); + MemoryEffects CurrentME = F.getMemoryEffects(); + if (MemoryEffects::unknown() != CurrentME) { + F.setMemoryEffects( + CurrentME | + MemoryEffects(MemoryEffects::Location::Other, ModRefInfo::ModRef) | + MemoryEffects(MemoryEffects::Location::InaccessibleMem, + ModRefInfo::ModRef)); + } +} + +// Return true if BB is an entry block to a function or task, false otherwise. +bool CSIImpl::isEntryBlock(const BasicBlock &BB, const TaskInfo &TI) { + return &BB == TI.getTaskFor(&BB)->getEntry(); +} + +// Check whether function-entry instrumentation can be inserted after +// instruction \p I. +static bool skipInstructionInEntryBB(const Instruction &I) { + if (isa(I)) + return true; + + if (isa(I)) + return true; + + if (const IntrinsicInst *II = dyn_cast(&I)) { + // Skip simple intrinsics + switch(II->getIntrinsicID()) { + case Intrinsic::annotation: + case Intrinsic::assume: + case Intrinsic::sideeffect: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + case Intrinsic::is_constant: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + case Intrinsic::experimental_gc_result: + case Intrinsic::experimental_gc_relocate: + case Intrinsic::experimental_noalias_scope_decl: + case Intrinsic::syncregion_start: + case Intrinsic::taskframe_create: + case Intrinsic::taskframe_use: + return true; + default: + return false; + } + } + + return false; +} + +// Scan the entry basic block \p BB to find the first point to insert +// instrumentation. +Instruction *CSIImpl::getEntryBBInsertPt(BasicBlock &BB) { + // If a previous insertion point was already found for this entry block, + // return it. + if (EntryBBInsertPt.count(&BB)) + return EntryBBInsertPt[&BB]; + + BasicBlock::iterator BI(BB.getFirstInsertionPt()); + BasicBlock::const_iterator BE(BB.end()); + + // Scan the basic block for the first instruction we should not skip. + while (BI != BE) { + if (!skipInstructionInEntryBB(*BI)) { + EntryBBInsertPt.insert(std::make_pair(&BB, &*BI)); + return &*BI; + } + ++BI; + } + + // We reached the end of the basic block; return the terminator. + EntryBBInsertPt.insert(std::make_pair(&BB, BB.getTerminator())); + return BB.getTerminator(); +} + +void CSIImpl::instrumentFunction(Function &F) { + // This is required to prevent instrumenting the call to + // __csi_module_init from within the module constructor. + + if (F.empty() || shouldNotInstrumentFunction(F) || + LinkedFromBitcode.count(&F)) + return; + + if (Options.CallsMayThrow) + // Promote calls to invokes to insert CSI instrumentation in + // exception-handling code. + setupCalls(F); + + const TargetLibraryInfo *TLI = &GetTLI(F); + + DominatorTree *DT = &GetDomTree(F); + LoopInfo &LI = GetLoopInfo(F); + + // If we do not assume that calls terminate blocks, or if we're not + // instrumenting basic blocks, then we're done. 
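+  // Otherwise, split basic blocks at call sites so that the basic-block
+  // instrumentation sees calls as block terminators.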
+ if (Options.InstrumentBasicBlocks && Options.CallsTerminateBlocks) + splitBlocksAtCalls(F, DT, &LI); + + if (Options.InstrumentLoops) + // Simplify loops to prepare for loop instrumentation + for (Loop *L : LI) + simplifyLoop(L, DT, &LI, nullptr, nullptr, nullptr, + /* PreserveLCSSA */ false); + + // Canonicalize the CFG for CSI instrumentation + setupBlocks(F, TLI, DT, &LI); + + LLVM_DEBUG(dbgs() << "Canonicalized function:\n" << F); + + SmallVector, 8> + LoadAndStoreProperties; + SmallVector AllocationFnCalls; + SmallVector FreeCalls; + SmallVector MemIntrinsics; + SmallVector Callsites; + SmallVector BasicBlocks; + SmallVector AtomicAccesses; + SmallVector Detaches; + SmallVector Syncs; + SmallVector Allocas; + SmallVector AllCalls; + bool MaySpawn = false; + SmallPtrSet BBsToIgnore; + + DenseMap SRCounters; + DenseMap SyncRegNums; + + TaskInfo &TI = GetTaskInfo(F); + ScalarEvolution *SE = nullptr; + if (GetScalarEvolution) + SE = &(*GetScalarEvolution)(F); + + // Compile lists of all instrumentation points before anything is modified. + for (BasicBlock &BB : F) { + // Ignore Tapir placeholder basic blocks + if (&F.getEntryBlock() != &BB && isTapirPlaceholderSuccessor(&BB)) + continue; + if (!DT->isReachableFromEntry(&BB)) + continue; + SmallVector BBLoadsAndStores; + for (Instruction &I : BB) { + if (isAtomic(&I)) + AtomicAccesses.push_back(&I); + else if (isa(I) || isa(I)) { + BBLoadsAndStores.push_back(&I); + } else if (DetachInst *DI = dyn_cast(&I)) { + MaySpawn = true; + Detaches.push_back(DI); + } else if (SyncInst *SI = dyn_cast(&I)) { + Syncs.push_back(SI); + if (isSyncUnwind(SI->getSuccessor(0)->getFirstNonPHIOrDbgOrLifetime(), + /*SyncRegion=*/nullptr, /*CheckForInvoke=*/true)) { + SyncsWithUnwinds.insert(SI); + BBsToIgnore.insert(SI->getSuccessor(0)); + } + } else if (CallBase *CB = dyn_cast(&I)) { + if (const IntrinsicInst *II = dyn_cast(CB)) { + if (Intrinsic::syncregion_start == II->getIntrinsicID()) { + // Identify this sync region with a counter value, where all sync + // regions within a function or task are numbered from 0. + if (TI.getTaskFor(&BB)) { + BasicBlock *TEntry = TI.getTaskFor(&BB)->getEntry(); + // Create a new counter if need be. + if (!SRCounters.count(TEntry)) + SRCounters[TEntry] = 0; + SyncRegNums[&I] = SRCounters[TEntry]++; + } + } + } + + // Record this function call as either an allocation function, a call to + // free (or delete), a memory intrinsic, or an ordinary real function + // call. + if (isAllocFn(&I, TLI)) + AllocationFnCalls.push_back(&I); + else if (isFreeFn(CB, TLI)) + FreeCalls.push_back(&I); + else if (isa(I)) + MemIntrinsics.push_back(&I); + else if (!callsPlaceholderFunction(I)) + Callsites.push_back(&I); + + AllCalls.push_back(&I); + + computeLoadAndStoreProperties(LoadAndStoreProperties, BBLoadsAndStores); + } else if (isa(I)) { + Allocas.push_back(&I); + } + } + computeLoadAndStoreProperties(LoadAndStoreProperties, BBLoadsAndStores); + if (!BBsToIgnore.count(&BB)) + BasicBlocks.push_back(&BB); + } + + uint64_t LocalId = getLocalFunctionID(F); + IRBuilder<> IRB(getEntryBBInsertPt(F.getEntryBlock())); + Value *FuncId = FunctionFED.localToGlobalId(LocalId, IRB); + + // Instrument basic blocks. Note that we do this before other instrumentation + // so that we put this at the beginning of the basic block, and then the + // function entry call goes before the call to basic block entry. + if (Options.InstrumentBasicBlocks) + for (BasicBlock *BB : BasicBlocks) + instrumentBasicBlock(*BB, TI); + + // Instrument Tapir constructs. 
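+  // Detach and sync instrumentation is additionally gated on the
+  // surgical-instrumentation configuration for this function.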
+ if (Options.InstrumentTapir) { + if (Config->DoesFunctionRequireInstrumentationForPoint( + F.getName(), InstrumentationPoint::INSTR_TAPIR_DETACH)) { + for (DetachInst *DI : Detaches) + instrumentDetach(DI, SyncRegNums[DI->getSyncRegion()], + SRCounters[DI->getDetached()], DT, TI, LI); + } + if (Config->DoesFunctionRequireInstrumentationForPoint( + F.getName(), InstrumentationPoint::INSTR_TAPIR_SYNC)) { + for (SyncInst *SI : Syncs) + instrumentSync(SI, SyncRegNums[SI->getSyncRegion()]); + } + } + + // Instrument allocas early, because they may require instrumentation inserted + // at an unusual place. + if (Options.InstrumentAllocas) + for (Instruction *I : Allocas) + instrumentAlloca(I, TI); + + if (Options.InstrumentLoops) + // Recursively instrument all loops + for (Loop *L : LI) + instrumentLoop(*L, TI, SE); + + // Do this work in a separate loop after copying the iterators so that we + // aren't modifying the list as we're iterating. + if (Options.InstrumentMemoryAccesses) + for (std::pair p : + LoadAndStoreProperties) + instrumentLoadOrStore(p.first, p.second); + + // Instrument atomic memory accesses in any case (they can be used to + // implement synchronization). + if (Options.InstrumentAtomics) + for (Instruction *I : AtomicAccesses) + instrumentAtomic(I); + + if (Options.InstrumentMemIntrinsics) + for (Instruction *I : MemIntrinsics) + instrumentMemIntrinsic(I); + + if (Options.InstrumentCalls) + for (Instruction *I : Callsites) + instrumentCallsite(I, DT); + + if (Options.InstrumentAllocFns) { + for (Instruction *I : AllocationFnCalls) + instrumentAllocFn(I, DT, TLI); + for (Instruction *I : FreeCalls) + instrumentFree(I, TLI); + } + + if (Options.Interpose && Config->DoesAnyFunctionRequireInterposition()) { + for (Instruction *I : AllCalls) + interposeCall(I); + } + + // Instrument function entry/exit points. 
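+  // The entry hook is inserted immediately after the function-ID computation;
+  // an exit hook is inserted at each function exit found by EscapeEnumerator.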
+ if (Options.InstrumentFuncEntryExit) { + IRBuilder<> IRB(cast(FuncId)->getNextNode()); + if (Config->DoesFunctionRequireInstrumentationForPoint( + F.getName(), InstrumentationPoint::INSTR_FUNCTION_ENTRY)) { + CsiFuncProperty FuncEntryProp; + FuncEntryProp.setMaySpawn(MaySpawn); + if (MaySpawn) + FuncEntryProp.setNumSyncReg(SRCounters[TI.getRootTask()->getEntry()]); + Value *PropVal = FuncEntryProp.getValue(IRB); + insertHookCall(&*IRB.GetInsertPoint(), CsiFuncEntry, {FuncId, PropVal}); + } + if (Config->DoesFunctionRequireInstrumentationForPoint( + F.getName(), InstrumentationPoint::INSTR_FUNCTION_EXIT)) { + EscapeEnumerator EE(F, "csi.cleanup", false); + while (IRBuilder<> *AtExit = EE.Next()) { + uint64_t ExitLocalId = FunctionExitFED.add(*AtExit->GetInsertPoint()); + Value *ExitCsiId = + FunctionExitFED.localToGlobalId(ExitLocalId, *AtExit); + CsiFuncExitProperty FuncExitProp; + FuncExitProp.setMaySpawn(MaySpawn); + FuncExitProp.setEHReturn(isa(AtExit->GetInsertPoint())); + Value *PropVal = FuncExitProp.getValue(*AtExit); + insertHookCall(&*AtExit->GetInsertPoint(), CsiFuncExit, + {ExitCsiId, FuncId, PropVal}); + } + } + } + + updateInstrumentedFnAttrs(F); +} + +Function *CSIImpl::getInterpositionFunction(Function *F) { + if (InterpositionFunctions.find(F) != InterpositionFunctions.end()) + return InterpositionFunctions.lookup(F); + + std::string InterposedName = "__csi_interpose_" + F->getName().str(); + Function *InterpositionFunction = cast( + M.getOrInsertFunction(InterposedName, F->getFunctionType()).getCallee()); + + InterpositionFunctions.insert({F, InterpositionFunction}); + + return InterpositionFunction; +} + +void ComprehensiveStaticInstrumentationLegacyPass::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); +} + +bool ComprehensiveStaticInstrumentationLegacyPass::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + CallGraph *CG = &getAnalysis().getCallGraph(); + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; + auto GetDomTree = [this](Function &F) -> DominatorTree & { + return this->getAnalysis(F).getDomTree(); + }; + auto GetLoopInfo = [this](Function &F) -> LoopInfo & { + return this->getAnalysis(F).getLoopInfo(); + }; + auto GetTTI = [this](Function &F) -> TargetTransformInfo & { + return this->getAnalysis().getTTI(F); + }; + auto GetSE = [this](Function &F) -> ScalarEvolution & { + return this->getAnalysis(F).getSE(); + }; + auto GetTaskInfo = [this](Function &F) -> TaskInfo & { + return this->getAnalysis(F).getTaskInfo(); + }; + + bool res = CSIImpl(M, CG, GetDomTree, GetLoopInfo, GetTaskInfo, GetTLI, GetSE, + GetTTI, Options) + .run(); + + verifyModule(M, &llvm::errs()); + + numPassRuns++; + + return res; +} + +CSISetupPass::CSISetupPass() : Options(OverrideFromCL(CSIOptions())) {} + +CSISetupPass::CSISetupPass(const CSIOptions &Options) : Options(Options) {} + +PreservedAnalyses CSISetupPass::run(Module &M, ModuleAnalysisManager &AM) { + if (!CSISetupImpl(M, Options).run()) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} + +ComprehensiveStaticInstrumentationPass::ComprehensiveStaticInstrumentationPass() + : Options(OverrideFromCL(CSIOptions())) {} + +ComprehensiveStaticInstrumentationPass::ComprehensiveStaticInstrumentationPass( + const CSIOptions &Options) + : Options(Options) {} + +PreservedAnalyses 
+ComprehensiveStaticInstrumentationPass::run(Module &M, + ModuleAnalysisManager &AM) { + auto &FAM = AM.getResult(M).getManager(); + + auto &CG = AM.getResult(M); + auto GetDT = [&FAM](Function &F) -> DominatorTree & { + return FAM.getResult(F); + }; + auto GetLI = [&FAM](Function &F) -> LoopInfo & { + return FAM.getResult(F); + }; + auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { + return FAM.getResult(F); + }; + auto GetSE = [&FAM](Function &F) -> ScalarEvolution & { + return FAM.getResult(F); + }; + auto GetTI = [&FAM](Function &F) -> TaskInfo & { + return FAM.getResult(F); + }; + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; + + // Disable additional conversion of calls to invokes. + Options.CallsMayThrow = false; + + if (!CSIImpl(M, &CG, GetDT, GetLI, GetTI, GetTLI, GetSE, GetTTI, Options) + .run()) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index c7f6f2a43c17f59..4308d470cd3221b 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -734,9 +734,12 @@ static BasicBlock *getInstrBB(CFGMST &MST, Edge &E, // Some IndirectBr critical edges cannot be split by the previous // SplitIndirectBrCriticalEdges call. Bail out. + // Similarly bail out due to critical edges that cannot be split after detach + // instructions. const unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); - BasicBlock *InstrBB = - isa(TI) ? nullptr : SplitCriticalEdge(TI, SuccNum); + BasicBlock *InstrBB = (isa(TI) || isa(TI)) + ? nullptr + : SplitCriticalEdge(TI, SuccNum); if (!InstrBB) return nullptr; diff --git a/llvm/lib/Transforms/Instrumentation/SurgicalInstrumentationConfig.cpp b/llvm/lib/Transforms/Instrumentation/SurgicalInstrumentationConfig.cpp new file mode 100644 index 000000000000000..67857f26c8ec31f --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/SurgicalInstrumentationConfig.cpp @@ -0,0 +1,109 @@ +//===-- SurgicalInstrumentationConfig.cpp -- Surgical CSI -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is part of CSI, a framework that provides comprehensive static +// instrumentation. 
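+// It implements reading of surgical-instrumentation configuration files,
+// which list the functions to instrument or interpose and, optionally, the
+// instrumentation points to use for each. An illustrative configuration in
+// the format parsed below ('#' starts a comment; the point names here are
+// placeholders, since the accepted names come from the
+// SurgicalInstrumentationPoints table):
+//
+//   INSTRUMENT
+//   foo,FUNCTION_ENTRY,FUNCTION_EXIT
+//   INTERPOSE
+//   malloc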
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation/SurgicalInstrumentationConfig.h" + +namespace llvm { +InstrumentationPoint +ParseInstrumentationPoint(const StringRef &instrPointString) { + if (SurgicalInstrumentationPoints.find(instrPointString) == + SurgicalInstrumentationPoints.end()) { + return InstrumentationPoint::INSTR_INVALID_POINT; + } else + return SurgicalInstrumentationPoints[instrPointString]; +} + +std::unique_ptr +llvm::InstrumentationConfig::GetDefault() { + return std::unique_ptr( + new DefaultInstrumentationConfig()); +} + +std::unique_ptr +InstrumentationConfig::ReadFromConfigurationFile(const std::string &filename) { + auto file = MemoryBuffer::getFile(filename); + + if (!file) { + llvm::report_fatal_error( + Twine("Instrumentation configuration file could not be opened: ") + + Twine(file.getError().message())); + } + + StringRef contents = file.get()->getBuffer(); + SmallVector lines; + + contents.split(lines, '\n', -1, false); + + StringMap functions; + StringSet<> interposedFunctions; + + bool interposeMode = false; + + // One instruction per line. + for (auto &line : lines) { + auto trimmedLine = line.trim(); + if (trimmedLine.size() == 0 || + trimmedLine[0] == '#') // Skip comments or empty lines. + continue; + + if (trimmedLine == "INTERPOSE") { + interposeMode = true; + continue; + } else if (trimmedLine == "INSTRUMENT") { + interposeMode = false; + continue; + } + + SmallVector tokens; + trimmedLine.split(tokens, ',', -1, false); + + if (interposeMode) { + interposedFunctions.insert(tokens[0]); + } else { + if (tokens.size() > 0) { + InstrumentationPoint points = InstrumentationPoint::INSTR_INVALID_POINT; + if (tokens.size() > + 1) // This function specifies specific instrumentation points. + { + for (size_t i = 1; i < tokens.size(); ++i) { + auto instrPoint = ParseInstrumentationPoint(tokens[i].trim()); + + points |= instrPoint; + } + } + + auto trimmed = tokens[0].trim(); + if (trimmed != "") + functions[trimmed] = points; + } + } + } + + // If the configuration file turned out to be empty, + // instrument everything. + if (functions.size() == 0 && interposedFunctions.size() == 0) + return GetDefault(); + + for (auto &function : functions) { + if (interposedFunctions.find(function.getKey()) != interposedFunctions.end()) { + llvm::errs() << "warning: function for which interpositioning was " + "requested is also listed for instrumentation. 
The " + "function will only be interposed"; + } + } + + return std::unique_ptr( + new InstrumentationConfig(functions, interposedFunctions)); +} + +} // namespace llvm diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 92e533d2281af88..d02e8440e5a1c29 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -46,6 +46,7 @@ #include "llvm/Transforms/Utils/EscapeEnumerator.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" using namespace llvm; @@ -573,6 +574,9 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); + if (ClHandleCxxExceptions && !F.doesNotThrow()) + promoteCallsInTasksToInvokes(F, "tsan_cleanup"); + EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions); while (IRBuilder<> *AtExit = EE.Next()) { InstrumentationIRBuilder::ensureDebugInfo(*AtExit, F); diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index db39d8621d07714..8eb3fbafa6dd50c 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -41,6 +41,7 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/PHITransAddr.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Attributes.h" @@ -73,6 +74,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/VNCoercion.h" #include #include @@ -825,9 +827,10 @@ PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) { auto *MemDep = isMemDepEnabled() ? &AM.getResult(F) : nullptr; auto &LI = AM.getResult(F); + auto &TI = AM.getResult(F); auto *MSSA = AM.getCachedResult(F); auto &ORE = AM.getResult(F); - bool Changed = runImpl(F, AC, DT, TLI, AA, MemDep, LI, &ORE, + bool Changed = runImpl(F, AC, DT, TLI, AA, MemDep, LI, &ORE, TI, MSSA ? &MSSA->getMSSA() : nullptr); if (!Changed) return PreservedAnalyses::all(); @@ -837,6 +840,7 @@ PreservedAnalyses GVNPass::run(Function &F, FunctionAnalysisManager &AM) { if (MSSA) PA.preserve(); PA.preserve(); + PA.preserve(); return PA; } @@ -1642,8 +1646,12 @@ bool GVNPass::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) { continue; } + if (isa(Pred->getTerminator())) { + continue; + } - if (Pred->getTerminator()->getNumSuccessors() != 1) { + if (Pred->getTerminator()->getNumSuccessors() != 1 && + !isa(Pred->getTerminator())) { if (isa(Pred->getTerminator())) { LLVM_DEBUG( dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '" @@ -1928,6 +1936,21 @@ bool GVNPass::processNonLocalLoad(LoadInst *Load) { } bool Changed = false; + + // If we depend on a detach instruction, reject. 
+ for (unsigned i = 0, e = NumDeps; i != e; ++i) { + MemDepResult DepInfo = Deps[i].getResult(); + if (!(DepInfo.getInst())) + continue; + if (isa(DepInfo.getInst()) || + isa(DepInfo.getInst())) { + LLVM_DEBUG(dbgs() << "GVN: Cannot process " << *Load + << " due to dependency on" << *(DepInfo.getInst()) + << "\n"); + return Changed; + } + } + // If this load follows a GEP, see if we can PRE the indices before analyzing. if (GetElementPtrInst *GEP = dyn_cast(Load->getOperand(0))) { @@ -2758,7 +2781,8 @@ bool GVNPass::processInstruction(Instruction *I) { bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, const TargetLibraryInfo &RunTLI, AAResults &RunAA, MemoryDependenceResults *RunMD, LoopInfo &LI, - OptimizationRemarkEmitter *RunORE, MemorySSA *MSSA) { + OptimizationRemarkEmitter *RunORE, TaskInfo *TI, + MemorySSA *MSSA) { AC = &RunAC; DT = &RunDT; VN.setDomTree(DT); @@ -2819,6 +2843,12 @@ bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, // iteration. DeadBlocks.clear(); + if (TI && Changed) + // Recompute task info. + // FIXME: Figure out a way to update task info that is less computationally + // wasteful. + TI->recalculate(F, *DT); + if (MSSA && VerifyMemorySSA) MSSA->verifyMemorySSA(); @@ -2982,6 +3012,8 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) { if (InvalidBlockRPONumbers) assignBlockRPONumber(*CurrentBlock->getParent()); + SmallVector, 8> Reattaches; + SmallVector, 8> Detaches; SmallVector, 8> predMap; for (BasicBlock *P : predecessors(CurrentBlock)) { // We're not interested in PRE where blocks with predecessors that are @@ -3001,15 +3033,27 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) { uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); Value *predV = findLeader(P, TValNo); if (!predV) { - predMap.push_back(std::make_pair(static_cast(nullptr), P)); - PREPred = P; - ++NumWithout; + if (!isa(P->getTerminator())) { + predMap.push_back(std::make_pair(static_cast(nullptr), P)); + PREPred = P; + ++NumWithout; + } + // Record any detach and reattach predecessors. + if (DetachInst *DI = dyn_cast(P->getTerminator())) + Detaches.push_back(std::make_pair(static_cast(nullptr), DI)); + if (ReattachInst *RI = dyn_cast(P->getTerminator())) + Reattaches.push_back(std::make_pair(static_cast(nullptr), RI)); } else if (predV == CurInst) { /* CurInst dominates this predecessor. */ NumWithout = 2; break; } else { predMap.push_back(std::make_pair(predV, P)); + // Record any detach and reattach predecessors. + if (DetachInst *DI = dyn_cast(P->getTerminator())) + Detaches.push_back(std::make_pair(predV, DI)); + if (ReattachInst *RI = dyn_cast(P->getTerminator())) + Reattaches.push_back(std::make_pair(predV, RI)); ++NumWith; } } @@ -3019,6 +3063,23 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) { if (NumWithout > 1 || NumWith == 0) return false; + for (auto RV : Reattaches) { + ReattachInst *RI = RV.second; + bool DetachFound = false; + for (auto DV : Detaches) { + DetachInst *DI = DV.second; + // Get the detach edge from DI. + BasicBlockEdge DetachEdge(DI->getParent(), DI->getDetached()); + if (DT->dominates(DetachEdge, RI->getParent())) { + DetachFound = true; + if (RV.first && (RV.first != DV.first)) + return false; + } + } + assert(DetachFound && + "Reattach predecessor found with no detach predecessor"); + } + // We may have a case where all predecessors have the instruction, // and we just need to insert a phi node. Otherwise, perform // insertion. 
@@ -3042,7 +3103,8 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) { // the edge to be split and perform the PRE the next time we iterate // on the function. unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); - if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { + if (isCriticalEdge(PREPred->getTerminator(), SuccNum) && + !isa(PREPred->getTerminator())) { toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); return false; } @@ -3055,6 +3117,22 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) { #endif PREInstr->deleteValue(); return false; + } else if (isa(PREPred->getTerminator())) { + for (auto RV : Reattaches) { + ReattachInst *RI = RV.second; + for (auto DV : Detaches) { + DetachInst *DI = DV.second; + // Get the detach edge from DI. + BasicBlockEdge DetachEdge(DI->getParent(), DI->getDetached()); + if (DT->dominates(DetachEdge, RI->getParent())) { + if (DI->getParent() == PREPred) { + assert(nullptr == DV.first && + "Detach predecessor already had a value."); + predMap.push_back(std::make_pair(PREInstr, RI->getParent())); + } + } + } + } } } @@ -3265,6 +3343,12 @@ void GVNPass::addDeadBlock(BasicBlock *BB) { if (llvm::is_contained(successors(P), B) && isCriticalEdge(P->getTerminator(), B)) { + + // Don't bother splitting critical edges to a detach-continue block, + // since both the detach and reattach predecessors must be dead. + if (isDetachContinueEdge(P->getTerminator(), B)) + continue; + if (BasicBlock *S = splitCriticalEdges(P, B)) DeadBlocks.insert(P = S); } @@ -3346,6 +3430,9 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { if (skipFunction(F)) return false; + auto *LIWP = getAnalysisIfAvailable(); + + auto *TIWP = getAnalysisIfAvailable(); auto *MSSAWP = getAnalysisIfAvailable(); return Impl.runImpl( F, getAnalysis().getAssumptionCache(F), @@ -3357,6 +3444,7 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { : nullptr, getAnalysis().getLoopInfo(), &getAnalysis().getORE(), + TIWP ? &TIWP->getTaskInfo() : nullptr, MSSAWP ? 
&MSSAWP->getMSSA() : nullptr); } @@ -3373,6 +3461,7 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass { AU.addPreserved(); AU.addPreserved(); AU.addRequired(); + AU.addPreserved(); AU.addPreserved(); } @@ -3387,6 +3476,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp index b5333c532280ca3..7f7469af9ac7468 100644 --- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -47,6 +47,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 5e2131b0b180755..5d5635958736c12 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -38,6 +38,7 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -76,6 +77,7 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include #include #include @@ -134,9 +136,11 @@ class IndVarSimplify { TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; std::unique_ptr MSSAU; + TaskInfo *TI; SmallVector DeadInsts; bool WidenIndVars; + bool TapirLoopsOnly; bool RunUnswitching = false; @@ -165,9 +169,10 @@ class IndVarSimplify { public: IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, const DataLayout &DL, TargetLibraryInfo *TLI, - TargetTransformInfo *TTI, MemorySSA *MSSA, bool WidenIndVars) - : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI), - WidenIndVars(WidenIndVars) { + TargetTransformInfo *TTI, MemorySSA *MSSA, TaskInfo *TI, + bool WidenIndVars, bool TapirLoopsOnly) + : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI), TI(TI), + WidenIndVars(WidenIndVars), TapirLoopsOnly(TapirLoopsOnly) { if (MSSA) MSSAU = std::make_unique(MSSA); } @@ -710,9 +715,35 @@ static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) { return ICmp->getOperand(0) == V || ICmp->getOperand(1) == V; } +/// Helper method to check if the given IV has the widest induction type. +static bool isWidestInductionType(Loop *L, PHINode *SimpleIV) { + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + uint64_t IVWidth = SimpleIV->getType()->getPrimitiveSizeInBits(); + for (BasicBlock::iterator I = L->getHeader()->begin(); isa(I); ++I) { + PHINode *Phi = cast(I); + if (Phi == SimpleIV) + continue; + + // Skip PHI nodes that are not of integer type. + if (!Phi->getType()->isIntegerTy()) + continue; + + // Skip PHI nodes that are not loop counters. 
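+    // (i.e., PHIs that have no incoming value from the loop latch).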
+ int Idx = Phi->getBasicBlockIndex(L->getLoopLatch()); + if (Idx < 0) + continue; + + // Check if Phi has a larger valid width than SimpleIV. + uint64_t PhiWidth = Phi->getType()->getPrimitiveSizeInBits(); + if (IVWidth < PhiWidth && DL.isLegalInteger(PhiWidth)) + return false; + } + return true; +} + /// linearFunctionTestReplace policy. Return true unless we can show that the /// current exit test is already sufficiently canonical. -static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) { +static bool needsLFTR(Loop *L, BasicBlock *ExitingBB, TaskInfo *TI) { assert(L->getLoopLatch() && "Must be in simplified form"); // Avoid converting a constant or loop invariant test back to a runtime @@ -756,7 +787,24 @@ static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) { // Do LFTR if the exit condition's IV is *not* a simple counter. Value *IncV = Phi->getIncomingValue(Idx); - return Phi != getLoopPhiForCounter(IncV, L); + if (Phi != getLoopPhiForCounter(IncV, L)) + return true; + + // Tapir loops are particularly picky about having canonical induction + // variables, so check if LFTR needs to create one. + if (getTaskIfTapirLoop(L, TI)) { + // Check that the simple IV has the widest induction type. + if (!isWidestInductionType(L, Phi)) + return true; + + // Check that the simple IV starts at 0. + if (BasicBlock *Preheader = L->getLoopPreheader()) + if (Constant *Start = + dyn_cast(Phi->getIncomingValueForBlock(Preheader))) + return !(Start->isZeroValue()); + } + + return false; } /// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils @@ -1891,6 +1939,31 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { return Changed; } +static bool ensureZeroStartIV(Loop *L, const DataLayout &DL, + ScalarEvolution *SE, DominatorTree *DT) { + BasicBlock *LatchBlock = L->getLoopLatch(); + + const SCEV *ExitCount = SE->getExitCount(L, LatchBlock); + if (isa(ExitCount)) + return false; + + PHINode *IndVar = FindLoopCounter(L, LatchBlock, ExitCount, SE, DT); + if (!IndVar) + return false; + + Instruction * const IncVar = + cast(IndVar->getIncomingValueForBlock(LatchBlock)); + + const SCEVAddRecExpr *AR = cast(SE->getSCEV(IncVar)); + + if (!AR->getStart()->isZero()) { + SCEVExpander ARRewriter(*SE, DL, "indvars"); + ARRewriter.expandCodeFor(AR, AR->getType(), + &L->getHeader()->front()); + } + return true; +} + //===----------------------------------------------------------------------===// // IndVarSimplify driver. Manage several subpasses of IV simplification. //===----------------------------------------------------------------------===// @@ -1912,11 +1985,19 @@ bool IndVarSimplify::run(Loop *L) { if (!L->isLoopSimplifyForm()) return false; + bool IsTapirLoop = (nullptr != getTaskIfTapirLoop(L, TI)); + if (TapirLoopsOnly && !IsTapirLoop) + return false; bool Changed = false; // If there are any floating-point recurrences, attempt to // transform them to use integer recurrences. Changed |= rewriteNonIntegerIVs(L); + // See if we need to create a canonical IV that starts at 0. Right now we + // only check for a Tapir loop, but this check might be generalized. + if (IsTapirLoop) + Changed |= ensureZeroStartIV(L, DL, SE, DT); + // Create a rewriter object which we'll use to transform the code with. 
SCEVExpander Rewriter(*SE, DL, "indvars"); #ifndef NDEBUG @@ -1986,7 +2067,7 @@ bool IndVarSimplify::run(Loop *L) { if (LI->getLoopFor(ExitingBB) != L) continue; - if (!needsLFTR(L, ExitingBB)) + if (!needsLFTR(L, ExitingBB, TI)) continue; const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); @@ -2006,7 +2087,8 @@ bool IndVarSimplify::run(Loop *L) { // Avoid high cost expansions. Note: This heuristic is questionable in // that our definition of "high cost" is not exactly principled. - if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget, + if (!IsTapirLoop && + Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget, TTI, PreHeader->getTerminator())) continue; @@ -2065,7 +2147,27 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, const DataLayout &DL = F->getDataLayout(); IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA, - WidenIndVars && AllowIVWidening); + &AR.TI, WidenIndVars && AllowIVWidening, + /*TapirLoopsOnly=*/false); + if (!IVS.run(&L)) + return PreservedAnalyses::all(); + + auto PA = getLoopPassPreservedAnalyses(); + PA.preserveSet(); + if (AR.MSSA) + PA.preserve(); + return PA; +} + +PreservedAnalyses TapirIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + Function *F = L.getHeader()->getParent(); + const DataLayout &DL = F->getParent()->getDataLayout(); + + IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA, + &AR.TI, WidenIndVars && AllowIVWidening, + /*TapirLoopsOnly=*/true); if (!IVS.run(&L)) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 7a0b661a07799a8..fa623491f22e486 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -68,6 +68,7 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -322,6 +323,8 @@ bool JumpThreadingPass::runImpl(Function &F_, FunctionAnalysisManager *FAM_, if (!ThreadAcrossLoopHeaders) findLoopHeaders(*F); + findTapirTasks(*F, DT); + bool EverChanged = false; bool Changed; do { @@ -351,6 +354,7 @@ bool JumpThreadingPass::runImpl(Function &F_, FunctionAnalysisManager *FAM_, << '\n'); LoopHeaders.erase(&BB); LVI->eraseBlock(&BB); + TapirTasks.erase(&BB); DeleteDeadBlock(&BB, DTU.get()); Changed = ChangedSinceLastAnalysisUpdate = true; continue; @@ -380,6 +384,7 @@ bool JumpThreadingPass::runImpl(Function &F_, FunctionAnalysisManager *FAM_, } while (Changed); LoopHeaders.clear(); + TapirTasks.clear(); return EverChanged; } @@ -487,6 +492,11 @@ static unsigned getJumpThreadDuplicationCost(const TargetTransformInfo *TTI, if (CI->cannotDuplicate() || CI->isConvergent()) return ~0U; + // Bail if we discover a taskframe.end intrinsic. + // TODO: Handle taskframe.end like a guard. 
+    if (isTapirIntrinsic(Intrinsic::taskframe_end, &*I))
+      return ~0U;
+
     if (TTI->getInstructionCost(&*I, TargetTransformInfo::TCK_SizeAndLatency)
         == TargetTransformInfo::TCC_Free)
       continue;
@@ -531,6 +541,32 @@ void JumpThreadingPass::findLoopHeaders(Function &F) {
     LoopHeaders.insert(Edge.second);
 }
 
+/// findTapirTasks - We must be careful when threading the continuation of a
+/// Tapir task, in order to make sure that reattaches always go to the
+/// continuation of their associated detaches. To ensure this, we first record
+/// all the associations between detaches and reattaches.
+void JumpThreadingPass::findTapirTasks(Function &F, DominatorTree &DT) {
+  for (const BasicBlock &BB : F) {
+    if (const DetachInst *DI = dyn_cast<DetachInst>(BB.getTerminator())) {
+      // Scan the predecessors of the detach continuation for reattaches that
+      // pair with this detach.
+      const BasicBlock *Detached = DI->getDetached();
+      for (const BasicBlock *PredBB : predecessors(DI->getContinue()))
+        if (isa<ReattachInst>(PredBB->getTerminator()) &&
+            DT.dominates(Detached, PredBB))
+          TapirTasks[&BB].insert(PredBB);
+
+      if (DI->hasUnwindDest())
+        // Scan the predecessors of the detach unwind for detached-rethrows that
+        // pair with this detach.
+        for (const BasicBlock *PredBB : predecessors(DI->getUnwindDest()))
+          if (isDetachedRethrow(PredBB->getTerminator()) &&
+              DT.dominates(Detached, PredBB))
+            TapirTasks[&BB].insert(PredBB);
+    }
+  }
+}
+
 /// getKnownConstant - Helper method to determine if we can thread over a
 /// terminator with the given value as its condition, and if so what value to
 /// use for that. What kind of value this is depends on whether we want an
@@ -1333,7 +1369,8 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
       }
     }
 
-    if (!PredAvailable) {
+    if (!PredAvailable ||
+        isa<ReattachInst>(PredBB->getTerminator())) {
       OneUnavailablePred = PredBB;
       continue;
     }
@@ -1376,6 +1413,9 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
   // unconditional branch, we know that it isn't a critical edge.
   if (PredsScanned.size() == AvailablePreds.size()+1 &&
       OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+    // If the predecessor is a reattach, we can't split the edge
+    if (isa<ReattachInst>(OneUnavailablePred->getTerminator()))
+      return false;
     UnavailablePred = OneUnavailablePred;
   } else if (PredsScanned.size() != AvailablePreds.size()) {
     // Otherwise, we had multiple unavailable predecessors or we had a critical
@@ -1389,7 +1429,8 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
     // Add all the unavailable predecessors to the PredsToSplit list.
     for (BasicBlock *P : predecessors(LoadBB)) {
       // If the predecessor is an indirect goto, we can't split the edge.
-      if (isa<IndirectBrInst>(P->getTerminator()))
+      if (isa<IndirectBrInst>(P->getTerminator()) ||
+          isa<ReattachInst>(P->getTerminator()))
         return false;
 
       if (!AvailablePredSet.count(P))
@@ -1620,6 +1661,43 @@ bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB,
     PredToDestList.emplace_back(Pred, DestBB);
   }
 
+  // For Tapir, remove any edges from detaches, reattaches, or detached-rethrows
+  // if we are trying to thread only a subset of the associated detaches,
+  // reattaches, and detached-rethrows among the predecessors.
+  erase_if(
+      PredToDestList,
+      [&](const std::pair<BasicBlock *, BasicBlock *> &PredToDest) {
+        // Bail if the predecessor is not terminated by a detach.
+ if (isa(PredToDest.first->getTerminator())) { + // If we are threading through a detach-continue or detach-unwind, + // check that all associated reattaches and detached-rethrows are also + // predecessors in PredToDestList. + for (const BasicBlock *TaskPred : TapirTasks[PredToDest.first]) { + if (isa(TaskPred->getTerminator()) || + isDetachedRethrow(TaskPred->getTerminator())) { + return none_of( + PredToDestList, + [&](const std::pair &PredToDest) { + return TaskPred == PredToDest.first; + }); + } + } + } else if (isa(PredToDest.first->getTerminator()) || + isDetachedRethrow(PredToDest.first->getTerminator())) { + // If we have a reattach or detached-rethrow predecessor, check that + // the associated detach is also a predecessor in PredToDestList. + const BasicBlock *ReattachPred = PredToDest.first; + return none_of( + PredToDestList, + [&](const std::pair &PredToDest) { + return isa(PredToDest.first->getTerminator()) && + TapirTasks.count(PredToDest.first) && + TapirTasks[PredToDest.first].contains(ReattachPred); + }); + } + return false; + }); + // If all edges were unthreadable, we fail. if (PredToDestList.empty()) return false; @@ -1893,7 +1971,7 @@ bool JumpThreadingPass::maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) { const Instruction *TI = SinglePred->getTerminator(); if (TI->isSpecialTerminator() || TI->getNumSuccessors() != 1 || - SinglePred == BB || hasAddressTakenAndUsed(BB)) + isa(TI) || SinglePred == BB || hasAddressTakenAndUsed(BB)) return false; // If SinglePred was a loop header, BB becomes one. @@ -2210,6 +2288,14 @@ bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB, return false; } + // Similarly, disregard cases where PredPredBB is terminated by a Tapir + // instruction. + if (isa(PredPredBB->getTerminator()) || + isa(PredPredBB->getTerminator()) || + isDetachedRethrow(PredPredBB->getTerminator()) || + isTaskFrameResume(PredPredBB->getTerminator())) + return false; + BasicBlock *SuccBB = CondBr->getSuccessor(PredPredBB == ZeroPred); // If threading to the same block as we come from, we would infinite loop. @@ -2748,7 +2834,7 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred( // Pred is a predecessor of BB with an unconditional branch to BB. SI is // a Select instruction in Pred. BB has other predecessors and SI is used in // a PHI node in BB. SI has no other use. -// A new basic block, NewBB, is created and SI is converted to compare and +// A new basic block, NewBB, is created and SI is converted to compare and // conditional branch. SI is erased from parent. 
void JumpThreadingPass::unfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB, SelectInst *SI, PHINode *SIUse, diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 91ef2b4b7c18391..43c0b5406f82e8a 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -56,6 +56,7 @@ #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -83,6 +84,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include #include using namespace llvm; @@ -175,13 +177,13 @@ static bool isNotUsedOrFoldableInLoop(const Instruction &I, const Loop *CurLoop, static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU, ScalarEvolution *SE, - OptimizationRemarkEmitter *ORE); + const TaskInfo *TI, OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU, OptimizationRemarkEmitter *ORE); static bool isSafeToExecuteUnconditionally( Instruction &Inst, const DominatorTree *DT, const TargetLibraryInfo *TLI, - const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, const TaskInfo *TI, OptimizationRemarkEmitter *ORE, const Instruction *CtxI, AssumptionCache *AC, bool AllowSpeculation); static bool pointerInvalidatedByLoop(MemorySSA *MSSA, MemoryUse *MU, @@ -218,7 +220,8 @@ struct LoopInvariantCodeMotion { bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA, - OptimizationRemarkEmitter *ORE, bool LoopNestMode = false); + TaskInfo *TI, OptimizationRemarkEmitter *ORE, + bool LoopNestMode = false); LoopInvariantCodeMotion(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap, @@ -266,7 +269,8 @@ struct LegacyLICMPass : public LoopPass { &getAnalysis().getAssumptionCache(*F), &getAnalysis().getTLI(*F), &getAnalysis().getTTI(*F), - SE ? &SE->getSE() : nullptr, MSSA, &ORE); + SE ? 
&SE->getSE() : nullptr, MSSA, + &getAnalysis().getTaskInfo(), &ORE); } /// This transformation requires natural loop information & requires that @@ -275,6 +279,8 @@ struct LegacyLICMPass : public LoopPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -305,7 +311,7 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, LoopInvariantCodeMotion LICM(Opts.MssaOptCap, Opts.MssaNoAccForPromotionCap, Opts.AllowSpeculation); if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.AC, &AR.TLI, &AR.TTI, - &AR.SE, AR.MSSA, &ORE)) + &AR.SE, AR.MSSA, &AR.TI, &ORE)) return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); @@ -340,8 +346,9 @@ PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, Opts.AllowSpeculation); Loop &OutermostLoop = LN.getOutermostLoop(); - bool Changed = LICM.runOnLoop(&OutermostLoop, &AR.AA, &AR.LI, &AR.DT, &AR.AC, - &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, &ORE, true); + bool Changed = + LICM.runOnLoop(&OutermostLoop, &AR.AA, &AR.LI, &AR.DT, &AR.AC, &AR.TLI, + &AR.TTI, &AR.SE, AR.MSSA, &AR.TI, &ORE, true); if (!Changed) return PreservedAnalyses::all(); @@ -405,13 +412,11 @@ llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags( /// Hoist expressions out of the specified loop. Note, alias info for inner /// loop is not preserved so it is not a good idea to run LICM multiple /// times on one loop. -bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, - DominatorTree *DT, AssumptionCache *AC, - TargetLibraryInfo *TLI, - TargetTransformInfo *TTI, - ScalarEvolution *SE, MemorySSA *MSSA, - OptimizationRemarkEmitter *ORE, - bool LoopNestMode) { +bool LoopInvariantCodeMotion::runOnLoop( + Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + ScalarEvolution *SE, MemorySSA *MSSA, TaskInfo *TI, + OptimizationRemarkEmitter *ORE, bool LoopNestMode) { bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -459,16 +464,16 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, // us to sink instructions in one pass, without iteration. After sinking // instructions, we perform another pass to hoist them out of the loop. if (L->hasDedicatedExits()) - Changed |= - LoopNestMode - ? sinkRegionForLoopNest(DT->getNode(L->getHeader()), AA, LI, DT, - TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE) - : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, - MSSAU, &SafetyInfo, Flags, ORE); + Changed |= LoopNestMode + ? 
sinkRegionForLoopNest(DT->getNode(L->getHeader()), AA, LI, + DT, TLI, TTI, L, MSSAU, &SafetyInfo, + Flags, TI, ORE) + : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, + TTI, L, MSSAU, &SafetyInfo, Flags, TI, ORE); Flags.setIsSink(false); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L, - MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, + MSSAU, SE, &SafetyInfo, Flags, TI, ORE, LoopNestMode, LicmAllowSpeculation); // Now that all loop invariants have been removed from the loop, promote any @@ -511,7 +516,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, collectPromotionCandidates(MSSA, AA, L)) { LocalPromoted |= promoteLoopAccessesToScalars( PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI, - DT, AC, TLI, TTI, L, MSSAU, &SafetyInfo, ORE, + DT, AC, TLI, TTI, L, MSSAU, &SafetyInfo, TI, ORE, LicmAllowSpeculation, HasReadsOutsideSet); } Promoted |= LocalPromoted; @@ -542,6 +547,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, if (Changed && SE) SE->forgetLoopDispositions(); + + if (Changed && TI) + // Recompute task info. + // FIXME: Figure out a way to update task info that is less computationally + // wasteful. + TI->recalculate(*DT->getRoot()->getParent(), *DT); return Changed; } @@ -554,12 +565,12 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, Loop *CurLoop, MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, - SinkAndHoistLICMFlags &Flags, + SinkAndHoistLICMFlags &Flags, TaskInfo *TI, OptimizationRemarkEmitter *ORE, Loop *OutermostLoop) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && - CurLoop != nullptr && SafetyInfo != nullptr && + CurLoop != nullptr && SafetyInfo != nullptr && TI != nullptr && "Unexpected input to sinkRegion."); // We want to visit children before parents. We will enqueue all the parents @@ -601,7 +612,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, isNotUsedOrFoldableInLoop(I, LoopNestMode ? OutermostLoop : CurLoop, SafetyInfo, TTI, FoldableInLoop, LoopNestMode) && - canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE)) { + canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, TI, Flags, ORE)) { if (sink(I, LI, DT, CurLoop, SafetyInfo, MSSAU, ORE)) { if (!FoldableInLoop) { ++II; @@ -623,7 +634,7 @@ bool llvm::sinkRegionForLoopNest(DomTreeNode *N, AAResults *AA, LoopInfo *LI, TargetTransformInfo *TTI, Loop *CurLoop, MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, - SinkAndHoistLICMFlags &Flags, + SinkAndHoistLICMFlags &Flags, TaskInfo *TI, OptimizationRemarkEmitter *ORE) { bool Changed = false; @@ -633,7 +644,7 @@ bool llvm::sinkRegionForLoopNest(DomTreeNode *N, AAResults *AA, LoopInfo *LI, while (!Worklist.empty()) { Loop *L = Worklist.pop_back_val(); Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, - MSSAU, SafetyInfo, Flags, ORE, CurLoop); + MSSAU, SafetyInfo, Flags, TI, ORE, CurLoop); } return Changed; } @@ -876,12 +887,12 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, TargetLibraryInfo *TLI, Loop *CurLoop, MemorySSAUpdater &MSSAU, ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, - SinkAndHoistLICMFlags &Flags, + SinkAndHoistLICMFlags &Flags, TaskInfo *TI, OptimizationRemarkEmitter *ORE, bool LoopNestMode, bool AllowSpeculation) { // Verify inputs. 
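+  // TaskInfo is a required input here: the hoisting safety queries below
+  // (canSinkOrHoistInst, isSafeToExecuteUnconditionally, and the
+  // guaranteed-to-execute checks) consult it so that Tapir detach, reattach,
+  // and sync boundaries are taken into account.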
assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && - CurLoop != nullptr && SafetyInfo != nullptr && + CurLoop != nullptr && SafetyInfo != nullptr && TI != nullptr && "Unexpected input to hoistRegion."); ControlFlowHoister CFH(LI, DT, CurLoop, MSSAU); @@ -912,12 +923,12 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // and we have accurately duplicated the control flow from the loop header // to that block. if (CurLoop->hasLoopInvariantOperands(&I) && - canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && + canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, TI, Flags, ORE) && isSafeToExecuteUnconditionally( - I, DT, TLI, CurLoop, SafetyInfo, ORE, + I, DT, TLI, CurLoop, SafetyInfo, TI, ORE, Preheader->getTerminator(), AC, AllowSpeculation)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, SE, ORE); + MSSAU, SE, TI, ORE); HoistedInstructions.push_back(&I); Changed = true; continue; @@ -945,7 +956,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, eraseInstruction(I, *SafetyInfo, MSSAU); hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), - SafetyInfo, MSSAU, SE, ORE); + SafetyInfo, MSSAU, SE, TI, ORE); HoistedInstructions.push_back(ReciprocalDivisor); Changed = true; continue; @@ -957,14 +968,14 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, match(&I, m_Intrinsic()); }; auto MustExecuteWithoutWritesBefore = [&](Instruction &I) { - return SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) && + return SafetyInfo->isGuaranteedToExecute(I, DT, TI, CurLoop) && SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop); }; if ((IsInvariantStart(I) || isGuard(&I)) && CurLoop->hasLoopInvariantOperands(&I) && MustExecuteWithoutWritesBefore(I)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, SE, ORE); + MSSAU, SE, TI, ORE); HoistedInstructions.push_back(&I); Changed = true; continue; @@ -978,7 +989,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, PN->setIncomingBlock( i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i))); hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, SE, ORE); + MSSAU, SE, TI, ORE); assert(DT->dominates(PN, BB) && "Conditional PHIs not expected"); Changed = true; continue; @@ -1159,7 +1170,7 @@ static MemoryAccess *getClobberingMemoryAccess(MemorySSA &MSSA, bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, Loop *CurLoop, MemorySSAUpdater &MSSAU, - bool TargetExecutesOncePerLoop, + bool TargetExecutesOncePerLoop, TaskInfo *TI, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { // If we don't understand the instruction, bail early. @@ -1563,6 +1574,11 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, CurLoop->getUniqueExitBlocks(ExitBlocks); SmallPtrSet ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); + + // Get the Tapir task exits for the current loop, in order to check for users + // contained in those task exits. 
+ SmallPtrSet CurLoopTaskExits; + CurLoop->getTaskExits(CurLoopTaskExits); #endif BasicBlock *ExitBB = PN->getParent(); assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block."); @@ -1603,8 +1619,15 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, SmallSetVector PredBBs(pred_begin(ExitBB), pred_end(ExitBB)); while (!PredBBs.empty()) { BasicBlock *PredBB = *PredBBs.begin(); - assert(CurLoop->contains(PredBB) && + assert((CurLoop->contains(PredBB) || CurLoopTaskExits.count(PredBB)) && "Expect all predecessors are in the loop"); + // Don't split loop-exit predecessor blocks terminated by a detach or + // detached.rethrow. + if (isa(PredBB->getTerminator()) || + isDetachedRethrow(PredBB->getTerminator())) { + PredBBs.remove(PredBB); + continue; + } if (PN->getBasicBlockIndex(PredBB) >= 0) { BasicBlock *NewPred = SplitBlockPredecessors( ExitBB, PredBB, ".split.loop.exit", DT, LI, MSSAU, true); @@ -1632,6 +1655,11 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, bool Changed = false; LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); + // Get the Tapir task exits for the current loop, in order to check for users + // contained in those task exits. + SmallPtrSet CurLoopTaskExits; + CurLoop->getTaskExits(CurLoopTaskExits); + // Iterate over users to be ready for actual sinking. Replace users via // unreachable blocks with undef and make all user PHIs trivially replaceable. SmallPtrSet VisitedUsers; @@ -1640,7 +1668,8 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, Use &U = UI.getUse(); ++UI; - if (VisitedUsers.count(User) || CurLoop->contains(User)) + if (VisitedUsers.count(User) || CurLoop->contains(User) || + CurLoopTaskExits.count(User->getParent())) continue; if (!DT->isReachableFromEntry(User->getParent())) { @@ -1736,7 +1765,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater &MSSAU, ScalarEvolution *SE, - OptimizationRemarkEmitter *ORE) { + const TaskInfo *TI, OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getNameOrAsOperand() << ": " << I << "\n"); ORE->emit([&]() { @@ -1756,7 +1785,7 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning // time in isGuaranteedToExecute if we don't actually have anything to // drop. It is a compile time optimization, not required for correctness. - !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) + !SafetyInfo->isGuaranteedToExecute(I, DT, TI, CurLoop)) I.dropUBImplyingAttrsAndMetadata(); if (isa(I)) @@ -1781,15 +1810,29 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, /// or if it is a trapping instruction and is guaranteed to execute. static bool isSafeToExecuteUnconditionally( Instruction &Inst, const DominatorTree *DT, const TargetLibraryInfo *TLI, - const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, const TaskInfo *TI, OptimizationRemarkEmitter *ORE, const Instruction *CtxI, AssumptionCache *AC, bool AllowSpeculation) { + if (CtxI) { + // Check for a load from a thread_local variable in a different spindle as + // CtxI. Loads from such variables are not safe to execute unconditionally + // outside of parallel loops. 
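+    // With work stealing, the spindle containing CtxI (for example, the loop
+    // preheader) and the spindle containing the load may run on different
+    // worker threads, and each thread sees its own copy of a thread_local
+    // variable.  A source-level sketch of the hazard (illustrative only):
+    //
+    //     extern thread_local int tls_counter;
+    //     cilk_for (int i = 0; i < n; ++i)
+    //       out[i] = tls_counter;   // must not be hoisted to the preheader
+    //
+    // Hoisting the load would pin it to whichever thread enters the loop.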
+ if (LoadInst *LI = dyn_cast(&Inst)) { + if (GlobalValue *GV = dyn_cast( + getUnderlyingObject(LI->getPointerOperand()))) { + if (GV->isThreadLocal() && TI->getSpindleFor(Inst.getParent()) != + TI->getSpindleFor(CtxI->getParent())) + return false; + } + } + } + if (AllowSpeculation && isSafeToSpeculativelyExecute(&Inst, CtxI, AC, DT, TLI)) return true; bool GuaranteedToExecute = - SafetyInfo->isGuaranteedToExecute(Inst, DT, CurLoop); + SafetyInfo->isGuaranteedToExecute(Inst, DT, TI, CurLoop); if (!GuaranteedToExecute) { auto *LI = dyn_cast(&Inst); @@ -1968,12 +2011,12 @@ bool llvm::promoteLoopAccessesToScalars( SmallVectorImpl &MSSAInsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, const TargetLibraryInfo *TLI, TargetTransformInfo *TTI, Loop *CurLoop, - MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, + MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, TaskInfo *TI, OptimizationRemarkEmitter *ORE, bool AllowSpeculation, bool HasReadsOutsideSet) { // Verify inputs. assert(LI != nullptr && DT != nullptr && CurLoop != nullptr && - SafetyInfo != nullptr && + SafetyInfo != nullptr && TI != nullptr && "Unexpected Input to promoteLoopAccessesToScalars"); LLVM_DEBUG({ @@ -2033,6 +2076,16 @@ bool llvm::promoteLoopAccessesToScalars( StoreSafetyUnknown, } StoreSafety = StoreSafetyUnknown; + // We cannot speculate loads to values that are stored in a detached + // context within the loop. Precompute whether or not there is a + // detach within this loop. + bool DetachWithinLoop = + isa(CurLoop->getHeader()->getTerminator()) || + llvm::any_of(CurLoop->getBlocks(), + [](const BasicBlock *BB) { + return isa(BB->getTerminator()); + }); + SmallVector LoopUses; // We start with an alignment of one and try to find instructions that allow @@ -2090,7 +2143,7 @@ bool llvm::promoteLoopAccessesToScalars( // alignment as well. if (!DereferenceableInPH || (InstAlignment > Alignment)) if (isSafeToExecuteUnconditionally( - *Load, DT, TLI, CurLoop, SafetyInfo, ORE, + *Load, DT, TLI, CurLoop, SafetyInfo, TI, ORE, Preheader->getTerminator(), AC, AllowSpeculation)) { DereferenceableInPH = true; Alignment = std::max(Alignment, InstAlignment); @@ -2103,6 +2156,20 @@ bool llvm::promoteLoopAccessesToScalars( if (!Store->isUnordered()) return false; + // We conservatively avoid promoting stores that are detached + // within the loop. Technically it can be legal to move these + // stores -- the program already contains a determinacy race + // -- but to preserve the serial execution, we have to avoid + // moving stores that are loaded. For now, we simply avoid + // moving these stores. + if (DetachWithinLoop && + CurLoop->contains(TI->getTaskFor(Store->getParent())->getEntry())) + return false; + + // Note that we only check GuaranteedToExecute inside the store case + // so that we do not introduce stores where they did not exist before + // (which would break the LLVM concurrency model). + SawUnorderedAtomic |= Store->isAtomic(); SawNotAtomic |= !Store->isAtomic(); @@ -2113,7 +2180,7 @@ bool llvm::promoteLoopAccessesToScalars( // raise the alignment on the promoted store. 
Align InstAlignment = Store->getAlign(); bool GuaranteedToExecute = - SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop); + SafetyInfo->isGuaranteedToExecute(*UI, DT, TI, CurLoop); StoreIsGuanteedToExecute |= GuaranteedToExecute; if (GuaranteedToExecute) { DereferenceableInPH = true; diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index b0b7ae60da98842..ae46b7ac2d828a3 100644 --- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PatternMatch.h" @@ -435,7 +436,7 @@ breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE, /// instructions out of the loop. static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &LI, - MemorySSA *MSSA, + TaskInfo &TI, MemorySSA *MSSA, OptimizationRemarkEmitter &ORE) { assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); @@ -475,7 +476,7 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, L->getHeader()) << "Loop deleted because it never executes"; }); - deleteDeadLoop(L, &DT, &SE, &LI, MSSA); + deleteDeadLoop(L, &DT, &SE, &LI, &TI, MSSA); ++NumDeleted; return LoopDeletionResult::Deleted; } @@ -508,7 +509,7 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, L->getHeader()) << "Loop deleted because it is invariant"; }); - deleteDeadLoop(L, &DT, &SE, &LI, MSSA); + deleteDeadLoop(L, &DT, &SE, &LI, &TI, MSSA); ++NumDeleted; return LoopDeletionResult::Deleted; @@ -525,7 +526,7 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM, // pass. Function analyses need to be preserved across loop transformations // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); - auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, AR.MSSA, ORE); + auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, AR.TI, AR.MSSA, ORE); // If we can prove the backedge isn't taken, just break it and be done. 
This // leaves the loop structure in place which means it can handle dispatching diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index c84e419c2a24581..1f05978ebb236fd 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -39,6 +39,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 0ee1afa76a8234e..a9bc789ab243348 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -50,6 +50,7 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -270,6 +271,10 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, if (!LIR.runOnLoop(&L)) return PreservedAnalyses::all(); + // FIXME: Recalculating TaskInfo for the whole function is wasteful. + // Optimize this routine in the future. + AR.TI.recalculate(*AR.DT.getRoot()->getParent(), AR.DT); + auto PA = getLoopPassPreservedAnalyses(); if (AR.MSSA) PA.preserve(); diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 489f12e689d3195..8400fde0ed55277 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -37,6 +37,7 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index a4f2dbf9a582899..029744b4515a300 100644 --- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -12,6 +12,7 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/TimeProfiler.h" @@ -235,6 +236,7 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, AM.getResult(F), AM.getResult(F), AM.getResult(F), + AM.getResult(F), BFI, BPI, MSSA}; @@ -360,6 +362,7 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, PA.preserve(); if (UseMemorySSA) PA.preserve(); + PA.preserve(); return PA; } diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index acb79e94d087c59..e098494b7634062 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include 
"llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" @@ -75,9 +76,10 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, std::optional MSSAU; if (AR.MSSA) MSSAU = MemorySSAUpdater(AR.MSSA); - bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, - MSSAU ? &*MSSAU : nullptr, SQ, false, Threshold, - false, PrepareForLTO || PrepareForLTOOption); + bool Changed = + LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, + MSSAU ? &*MSSAU : nullptr, &AR.TI, SQ, false, Threshold, + false, PrepareForLTO || PrepareForLTOOption); if (!Changed) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index ae9103d0608a11b..2a460a2a184adef 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/CommandLine.h" @@ -726,6 +727,11 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, if (DeleteCurrentLoop) LPMU.markLoopAsDeleted(L, "loop-simplifycfg"); + // Recompute task info. + // FIXME: Figure out a way to update task info that is less computationally + // wasteful. + AR.TI.recalculate(*AR.DT.getRoot()->getParent(), AR.DT); + auto PA = getLoopPassPreservedAnalyses(); if (AR.MSSA) PA.preserve(); diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp index 6eedf95e7575ec8..91736d458f7d4be 100644 --- a/llvm/lib/Transforms/Scalar/LoopSink.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp @@ -39,6 +39,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/BranchProbability.h" @@ -282,7 +283,8 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, DominatorTree &DT, BlockFrequencyInfo &BFI, MemorySSA &MSSA, - ScalarEvolution *SE) { + ScalarEvolution *SE, + TaskInfo *TI) { BasicBlock *Preheader = L.getLoopPreheader(); assert(Preheader && "Expected loop to have preheader"); @@ -325,7 +327,7 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, // No need to check for instruction's operands are loop invariant. 
assert(L.hasLoopInvariantOperands(&I) && "Insts in a loop's preheader should have loop invariant operands!"); - if (!canSinkOrHoistInst(I, &AA, &DT, &L, MSSAU, false, LICMFlags)) + if (!canSinkOrHoistInst(I, &AA, &DT, &L, MSSAU, false, TI, LICMFlags)) continue; if (sinkInstruction(L, I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI, &MSSAU)) { @@ -351,6 +353,7 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { AAResults &AA = FAM.getResult(F); DominatorTree &DT = FAM.getResult(F); + TaskInfo &TI = FAM.getResult(F); BlockFrequencyInfo &BFI = FAM.getResult(F); MemorySSA &MSSA = FAM.getResult(F).getMSSA(); @@ -373,7 +376,7 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { // loops in SCEV and we don't preserve (or request) SCEV at all making that // unnecessary. Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI, MSSA, - /*ScalarEvolution*/ nullptr); + /*ScalarEvolution*/ nullptr, &TI); } while (!PreorderLoops.empty()); if (!Changed) diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 91461d1ed27592c..4c28719eeb2ddc2 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -77,6 +77,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -6359,6 +6360,7 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); AU.addRequired(); + AU.addPreserved(); AU.addPreserved(); } @@ -7413,11 +7415,19 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { *L->getHeader()->getParent()); auto &TLI = getAnalysis().getTLI( *L->getHeader()->getParent()); + auto *TIWP = getAnalysisIfAvailable(); + auto *TI = TIWP ? &TIWP->getTaskInfo() : nullptr; auto *MSSAAnalysis = getAnalysisIfAvailable(); MemorySSA *MSSA = nullptr; if (MSSAAnalysis) MSSA = &MSSAAnalysis->getMSSA(); - return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA); + bool Changed = ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA); + if (TI && Changed) + // Recompute task info. + // FIXME: Figure out a way to update task info that is less computationally + // wasteful. + TI->recalculate(*DT.getRoot()->getParent(), DT); + return Changed; } PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM, @@ -7427,6 +7437,11 @@ PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM, AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA)) return PreservedAnalyses::all(); + // Recompute task info. + // FIXME: Figure out a way to update task info that is less computationally + // wasteful. 
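+  // recalculate() rebuilds the whole-function task and spindle structure
+  // from the already-updated dominator tree; this conservatively revalidates
+  // TaskInfo after whatever rewriting LSR performed.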
+ AR.TI.recalculate(*AR.DT.getRoot()->getParent(), AR.DT); + auto PA = getLoopPassPreservedAnalyses(); if (AR.MSSA) PA.preserve(); diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index cbc35b6dd4292aa..a3fae6d36004f66 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -32,6 +32,7 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -1166,7 +1167,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, std::optional ProvidedAllowPeeling, std::optional ProvidedAllowProfileBasedPeeling, std::optional ProvidedFullUnrollMaxCount, - AAResults *AA = nullptr) { + TaskInfo* TI, AAResults *AA = nullptr) { LLVM_DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" @@ -1350,8 +1351,9 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, ULO.Runtime = UP.Runtime; ULO.ForgetAllSCEV = ForgetAllSCEV; ULO.Heart = getLoopConvergenceHeart(L); - LoopUnrollResult UnrollResult = UnrollLoop( - L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA); + LoopUnrollResult UnrollResult = + UnrollLoop(L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, + &RemainderLoop, AA, TI); if (UnrollResult == LoopUnrollResult::Unmodified) return LoopUnrollResult::Unmodified; @@ -1439,6 +1441,7 @@ class LoopUnroll : public LoopPass { auto &DT = getAnalysis().getDomTree(); LoopInfo *LI = &getAnalysis().getLoopInfo(); + TaskInfo *TI = &getAnalysis().getTaskInfo(); ScalarEvolution &SE = getAnalysis().getSE(); const TargetTransformInfo &TTI = getAnalysis().getTTI(F); @@ -1454,7 +1457,7 @@ class LoopUnroll : public LoopPass { /*OnlyFullUnroll*/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling, - ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount); + ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount, TI); if (Result == LoopUnrollResult::FullyUnrolled) LPM.markLoopAsDeleted(*L); @@ -1528,12 +1531,12 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, /*Runtime*/ false, /*UpperBound*/ false, /*AllowPeeling*/ true, /*AllowProfileBasedPeeling*/ false, - /*FullUnrollMaxCount*/ std::nullopt) != + /*FullUnrollMaxCount*/ std::nullopt, &AR.TI) != LoopUnrollResult::Unmodified; if (!Changed) return PreservedAnalyses::all(); - // The parent must not be damaged by unrolling! + // The parent must not be damaged by unrolling! #ifndef NDEBUG if (ParentL) ParentL->verifyLoop(); @@ -1597,6 +1600,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, auto &TTI = AM.getResult(F); auto &DT = AM.getResult(F); auto &AC = AM.getResult(F); + auto &TI = AM.getResult(F); auto &ORE = AM.getResult(F); AAResults &AA = AM.getResult(F); @@ -1623,6 +1627,13 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, Changed |= formLCSSARecursively(*L, DT, &LI, &SE); } + if (Changed) + // Update TaskInfo manually using the updated DT. + // + // FIXME: Recalculating TaskInfo for the whole function is wasteful. + // Optimize this routine in the future. + TI.recalculate(*DT.getRoot()->getParent(), DT); + // Add the loop nests in the reverse order of LoopInfo. 
See method // declaration. SmallPriorityWorklist Worklist; @@ -1654,7 +1665,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, /*Count*/ std::nullopt, /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling, - UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount, + UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount, &TI, &AA); Changed |= Result != LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 663715948241d97..2bd80a440fedb7d 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -582,7 +582,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM, const Function *F = L.getHeader()->getParent(); OptimizationRemarkEmitter ORE(F); - LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr); + LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, LAR.TI, nullptr, nullptr); if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index c738a2a6f39a45f..9ca2a77df4969d9 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -43,6 +43,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -115,6 +116,8 @@ STATISTIC( "Number of stores rewritten into predicated loads to allow promotion"); STATISTIC(NumDeleted, "Number of instructions deleted"); STATISTIC(NumVectorized, "Number of vectorized aggregates"); +STATISTIC(NumNotParallelPromotable, "Number of alloca's not promotable due to " + "Tapir instructions"); /// Disable running mem2reg during SROA in order to test or debug SROA. static cl::opt SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false), @@ -2829,8 +2832,11 @@ class AllocaSliceRewriter : public InstVisitor { Value *rewriteIntegerLoad(LoadInst &LI) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); - Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, - NewAI.getAlign(), "load"); + LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, + NewAI.getAlign(), "load"); + if (LI.isAtomic()) + NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); + Value *V = NewLI; V = convertValue(DL, IRB, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; @@ -3023,6 +3029,9 @@ class AllocaSliceRewriter : public InstVisitor { Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset, V->getType(), DL)); + if (SI.isAtomic()) + Store->setAtomic(SI.getOrdering(), SI.getSyncScopeID()); + migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI, Store, Store->getPointerOperand(), Store->getValueOperand(), DL); @@ -4605,6 +4614,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // a direct store) as needing to be resplit because it is no longer // promotable. 
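+    // Under Tapir, promotability also requires TI->isAllocaParallelPromotable:
+    // roughly, mem2reg must not have to forward a value defined inside a
+    // detached task to uses outside that task.  The asserts below record that
+    // the allocas queued for resplitting still satisfy this.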
if (AllocaInst *OtherAI = dyn_cast(StoreBasePtr)) { + assert(TI->isAllocaParallelPromotable(OtherAI) && + "Alloca must be promotable"); ResplitPromotableAllocas.insert(OtherAI); Worklist.insert(OtherAI); } else if (AllocaInst *OtherAI = dyn_cast( @@ -4728,6 +4739,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { if (!SplitLoads) { if (AllocaInst *OtherAI = dyn_cast(LoadBasePtr)) { assert(OtherAI != &AI && "We can't re-split our own alloca!"); + assert(TI->isAllocaParallelPromotable(OtherAI) && + "Alloca must be promotable"); ResplitPromotableAllocas.insert(OtherAI); Worklist.insert(OtherAI); } else if (AllocaInst *OtherAI = dyn_cast( @@ -4923,6 +4936,11 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops)); } + // Check if any detaches block promotion. + if (!TI->isAllocaParallelPromotable(NewAI)) + ++NumNotParallelPromotable; + Promotable &= TI->isAllocaParallelPromotable(NewAI); + if (Promotable) { for (Use *U : AS.getDeadUsesIfPromotable()) { auto *OldInst = dyn_cast(U->get()); @@ -4933,6 +4951,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, } if (PHIUsers.empty() && SelectUsers.empty()) { // Promote the alloca. + assert(TI->isAllocaParallelPromotable(NewAI) && + "Alloca must be promotable"); PromotableAllocas.push_back(NewAI); } else { // If we have either PHIs or Selects to speculate, add them to those @@ -5561,7 +5581,7 @@ bool SROA::promoteAllocas(Function &F) { LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n"); } else { LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, DTU->getDomTree(), AC); + PromoteMemToReg(PromotableAllocas, DTU->getDomTree(), AC, TI); } PromotableAllocas.clear(); @@ -5571,16 +5591,28 @@ bool SROA::promoteAllocas(Function &F) { std::pair SROA::runSROA(Function &F) { LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); + // Scan the function to get its entry block and all entry blocks of detached + // CFG's. We can perform this scan for entry blocks once for the function, + // because this pass preserves the CFG. + SmallVector EntryBlocks; + for (Task *T : depth_first(TI->getRootTask())) { + EntryBlocks.push_back(T->getEntry()); + if (Value *TaskFrame = T->getTaskFrameUsed()) + EntryBlocks.push_back(cast(TaskFrame)->getParent()); + } + const DataLayout &DL = F.getDataLayout(); - BasicBlock &EntryBB = F.getEntryBlock(); - for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); - I != E; ++I) { - if (AllocaInst *AI = dyn_cast(I)) { - if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() && - isAllocaPromotable(AI)) - PromotableAllocas.push_back(AI); - else - Worklist.insert(AI); + for (BasicBlock *BB : EntryBlocks) { + BasicBlock &EntryBB = *BB; + for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); + I != E; ++I) { + if (AllocaInst *AI = dyn_cast(I)) { + if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() && + isAllocaPromotable(AI) && TI->isAllocaParallelPromotable(AI)) + PromotableAllocas.push_back(AI); + else + Worklist.insert(AI); + } } } @@ -5608,6 +5640,13 @@ std::pair SROA::runSROA(Function &F) { llvm::erase_if(PromotableAllocas, IsInSet); DeletedAllocas.clear(); } + + // Preserve TaskInfo by manually updating it based on the updated DT. + if (IterationCFGChanged && TI) { + // FIXME: Recalculating TaskInfo for the whole function is wasteful. + // Optimize this routine in the future. 
+ TI->recalculate(F, DTU->getDomTree()); + } } Changed |= promoteAllocas(F); @@ -5632,15 +5671,17 @@ std::pair SROA::runSROA(Function &F) { PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) { DominatorTree &DT = AM.getResult(F); AssumptionCache &AC = AM.getResult(F); + TaskInfo& TI = AM.getResult(F); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); auto [Changed, CFGChanged] = - SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F); + SROA(&F.getContext(), &DTU, &AC, &TI, PreserveCFG).runSROA(F); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; if (!CFGChanged) PA.preserveSet(); PA.preserve(); + PA.preserve(); return PA; } @@ -5676,16 +5717,19 @@ class SROALegacyPass : public FunctionPass { AssumptionCache &AC = getAnalysis().getAssumptionCache(F); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + TaskInfo& TI = getAnalysis().getTaskInfo(); auto [Changed, _] = - SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F); + SROA(&F.getContext(), &DTU, &AC, &TI, PreserveCFG).runSROA(F); return Changed; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addPreserved(); } StringRef getPassName() const override { return "SROA"; } @@ -5704,5 +5748,6 @@ INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", false, false) diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index c235d2fb2a5bd46..02b2060168d76d7 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" @@ -42,6 +43,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Use.h" @@ -58,6 +60,7 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -1205,6 +1208,13 @@ static BasicBlock *buildClonedLoopBlocks( if (!SkipBlock(LoopBB)) CloneBlock(LoopBB); + // Clone any task-exit blocks in the loop as well. + SmallPtrSet TaskExitBlocks; + L.getTaskExits(TaskExitBlocks); + for (auto *LoopBB : TaskExitBlocks) + if (!SkipBlock(LoopBB)) + CloneBlock(LoopBB); + // Split all the loop exit edges so that when we clone the exit blocks, if // any of the exit blocks are *also* a preheader for some other loop, we // don't create multiple predecessors entering the loop header. 
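+  // Task exits (cloned above) are, roughly, blocks that belong to tasks
+  // detached inside the loop but that LoopInfo does not count as loop blocks,
+  // such as exception-handling paths ending in a detached.rethrow.  Each
+  // unswitched copy of the loop needs its own copy of them so that every
+  // cloned detach keeps a complete detached sub-CFG.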
@@ -2175,8 +2185,9 @@ void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, static void unswitchNontrivialInvariants( Loop &L, Instruction &TI, ArrayRef Invariants, IVConditionInfo &PartialIVInfo, DominatorTree &DT, LoopInfo &LI, - AssumptionCache &AC, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, - LPMUpdater &LoopUpdater, bool InsertFreeze, bool InjectedCondition) { + AssumptionCache &AC, ScalarEvolution *SE, TaskInfo *TaskI, + MemorySSAUpdater *MSSAU, LPMUpdater &LoopUpdater, bool InsertFreeze, + bool InjectedCondition) { auto *ParentBB = TI.getParent(); BranchInst *BI = dyn_cast(&TI); SwitchInst *SI = BI ? nullptr : cast(&TI); @@ -2250,7 +2261,7 @@ static void unswitchNontrivialInvariants( // Compute the parent loop now before we start hacking on things. Loop *ParentL = L.getParentLoop(); // Get blocks in RPO order for MSSA update, before changing the CFG. - LoopBlocksRPO LBRPO(&L); + LoopBlocksRPO LBRPO(&L, /*IncludeTaskExits*/ true); if (MSSAU) LBRPO.perform(&LI); @@ -2335,7 +2346,7 @@ static void unswitchNontrivialInvariants( // guaranteed no reach implicit null check after following this branch. ICFLoopSafetyInfo SafetyInfo; SafetyInfo.computeLoopSafetyInfo(&L); - if (!SafetyInfo.isGuaranteedToExecute(TI, &DT, &L)) + if (!SafetyInfo.isGuaranteedToExecute(TI, &DT, TaskI, &L)) TI.setMetadata(LLVMContext::MD_make_implicit, nullptr); } } @@ -3269,13 +3280,36 @@ static bool collectUnswitchCandidatesWithInjections( return Found; } +static bool +checkTapirSyncRegionInLoop(const Loop &L, + const SmallPtrSetImpl &TaskExits, + const Instruction &I) { + for (const User *Usr : I.users()) + if (const Instruction *UsrI = dyn_cast(Usr)) { + const BasicBlock *Parent = UsrI->getParent(); + if (!L.contains(Parent) && !TaskExits.contains(Parent)) + return false; + } + return true; +} + static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) { if (!L.isSafeToClone()) return false; + SmallPtrSet TaskExits; + L.getTaskExits(TaskExits); for (auto *BB : L.blocks()) for (auto &I : *BB) { - if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) + if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) { + if (isTapirIntrinsic(Intrinsic::syncregion_start, &I)) { + if (!checkTapirSyncRegionInLoop(L, TaskExits, I)) + return false; + // All uses of this syncregion.start are inside of the loop, so it's + // safe for unswitching. + continue; + } return false; + } if (auto *CB = dyn_cast(&I)) { assert(!CB->cannotDuplicate() && "Checked by L.isSafeToClone()."); if (CB->isConvergent()) @@ -3289,7 +3323,7 @@ static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) { // loops "out of thin air". If we ever discover important use cases for doing // this, we can add support to loop unswitch, but it is a lot of complexity // for what seems little or no real world benefit. - LoopBlocksRPO RPOT(&L); + LoopBlocksRPO RPOT(&L, /*IncludeTaskExits*/ true); RPOT.perform(&LI); if (containsIrreducibleCFG(RPOT, LI)) return false; @@ -3470,14 +3504,14 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate( // of the loop. Insert a freeze to prevent this case. // 3. 
The branch condition may be poison or undef static bool shouldInsertFreeze(Loop &L, Instruction &TI, DominatorTree &DT, - AssumptionCache &AC) { + AssumptionCache &AC, TaskInfo *TaskI) { assert(isa(TI) || isa(TI)); if (!FreezeLoopUnswitchCond) return false; ICFLoopSafetyInfo SafetyInfo; SafetyInfo.computeLoopSafetyInfo(&L); - if (SafetyInfo.isGuaranteedToExecute(TI, &DT, &L)) + if (SafetyInfo.isGuaranteedToExecute(TI, &DT, TaskI, &L)) return false; Value *Cond; @@ -3492,7 +3526,7 @@ static bool shouldInsertFreeze(Loop &L, Instruction &TI, DominatorTree &DT, static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, AAResults &AA, TargetTransformInfo &TTI, ScalarEvolution *SE, - MemorySSAUpdater *MSSAU, + TaskInfo *TaskI, MemorySSAUpdater *MSSAU, LPMUpdater &LoopUpdater) { // Collect all invariant conditions within this loop (as opposed to an inner // loop which would be handled when visiting that inner loop). @@ -3550,14 +3584,14 @@ static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, if (isGuard(Best.TI)) Best.TI = turnGuardIntoBranch(cast(Best.TI), L, DT, LI, MSSAU); - InsertFreeze = shouldInsertFreeze(L, *Best.TI, DT, AC); + InsertFreeze = shouldInsertFreeze(L, *Best.TI, DT, AC, TaskI); } LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " << Best.Cost << ") terminator: " << *Best.TI << "\n"); unswitchNontrivialInvariants(L, *Best.TI, Best.Invariants, PartialIVInfo, DT, - LI, AC, SE, MSSAU, LoopUpdater, InsertFreeze, - InjectedCondition); + LI, AC, SE, TaskI, MSSAU, LoopUpdater, + InsertFreeze, InjectedCondition); return true; } @@ -3585,7 +3619,7 @@ static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, AAResults &AA, TargetTransformInfo &TTI, bool Trivial, - bool NonTrivial, ScalarEvolution *SE, + bool NonTrivial, ScalarEvolution *SE, TaskInfo* TaskI, MemorySSAUpdater *MSSAU, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, LPMUpdater &LoopUpdater) { assert(L.isRecursivelyLCSSAForm(DT, LI) && @@ -3670,7 +3704,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, // Try to unswitch the best invariant condition. We prefer this full unswitch to // a partial unswitch when possible below the threshold. - if (unswitchBestCondition(L, DT, LI, AC, AA, TTI, SE, MSSAU, LoopUpdater)) + if (unswitchBestCondition(L, DT, LI, AC, AA, TTI, SE, TaskI, MSSAU, LoopUpdater)) return true; // No other opportunities to unswitch. @@ -3697,7 +3731,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); } if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.AA, AR.TTI, Trivial, NonTrivial, - &AR.SE, MSSAU ? &*MSSAU : nullptr, PSI, AR.BFI, U)) + &AR.SE, &AR.TI, MSSAU ? &*MSSAU : nullptr, PSI, AR.BFI, U)) return PreservedAnalyses::all(); if (AR.MSSA && VerifyMemorySSA) @@ -3707,6 +3741,11 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, // in asserts builds. assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); + // Recompute task info. + // FIXME: Figure out a way to update task info that is less computationally + // wasteful. 
+ AR.TI.recalculate(F, AR.DT); + auto PA = getLoopPassPreservedAnalyses(); if (AR.MSSA) PA.preserve(); diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 11de37f7a7c1094..4b563ad20bca0d4 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -39,6 +39,9 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/Transforms/Utils/Local.h" @@ -222,6 +225,73 @@ static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F, return Changed; } +static bool removeUselessSyncs(Function &F, DomTreeUpdater *DTU) { + bool Changed = false; + // Scan all the blocks in the function + check: + for (BasicBlock &BB : make_early_inc_range(F)) { + if (DTU && DTU->isBBPendingDeletion(&BB)) + continue; + if (SyncInst *Sync = dyn_cast(BB.getTerminator())) { + // Walk the CFG backwards to try to find a reaching detach instruction. + bool ReachingDetach = false; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(&BB); + while (!WorkList.empty()) { + BasicBlock *PBB = WorkList.pop_back_val(); + if (!Visited.insert(PBB).second) + continue; + + for (pred_iterator PI = pred_begin(PBB), PE = pred_end(PBB); + PI != PE; ++PI) { + BasicBlock *Pred = *PI; + Instruction *PT = Pred->getTerminator(); + // Stop the traversal at the entry block of a detached CFG. + if (DetachInst *DI = dyn_cast(PT)) { + if (DI->getDetached() == PBB) + continue; + else if (DI->getSyncRegion() == Sync->getSyncRegion()) + // This detach reaches the sync through the continuation edge. + ReachingDetach = true; + } + if (ReachingDetach) + break; + + // Ignore predecessors via a reattach, which belong to child detached + // contexts. + if (isa(PT) || isDetachedRethrow(PT)) + continue; + + // For a predecessor terminated by a sync instruction, check the sync + // region it belongs to. If the sync belongs to the same sync region, + // ignore the predecessor. + if (SyncInst *SI = dyn_cast(PT)) + if (SI->getSyncRegion() == Sync->getSyncRegion()) + continue; + + WorkList.push_back(Pred); + } + } + + // If no detach reaches this sync, then this sync can be removed. + if (!ReachingDetach) { + BasicBlock* Succ = Sync->getSuccessor(0); + const Value *SyncReg = Sync->getSyncRegion(); + Instruction *MaybeSyncUnwind = Succ->getFirstNonPHIOrDbgOrLifetime(); + ReplaceInstWithInst(Sync, BranchInst::Create(Succ)); + Changed = true; + bool Recheck = false; + if (isSyncUnwind(MaybeSyncUnwind, SyncReg)) + Recheck |= removeDeadSyncUnwind(cast(MaybeSyncUnwind), DTU); + Recheck |= MergeBlockIntoPredecessor(Succ, DTU); + if (Recheck) goto check; + } + } + } + return Changed; +} + /// Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, @@ -276,6 +346,7 @@ static bool simplifyFunctionCFGImpl(Function &F, const TargetTransformInfo &TTI, EverChanged |= tailMergeBlocksWithSimilarFunctionTerminators(F, DT ? &DTU : nullptr); EverChanged |= iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options); + EverChanged |= removeUselessSyncs(F, DT ? &DTU : nullptr); // If neither pass changed anything, we're done. 
if (!EverChanged) return false; @@ -291,6 +362,7 @@ static bool simplifyFunctionCFGImpl(Function &F, const TargetTransformInfo &TTI, do { EverChanged = iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options); EverChanged |= removeUnreachableBlocks(F, DT ? &DTU : nullptr); + EverChanged |= removeUselessSyncs(F, DT ? &DTU : nullptr); } while (EverChanged); return true; diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 1b3e6d9549b825f..c10fb6c9de03de4 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -72,6 +72,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -79,6 +80,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" using namespace llvm; #define DEBUG_TYPE "tailcallelim" @@ -430,6 +432,9 @@ class TailRecursionEliminator { // The instruction doing the accumulating. Instruction *AccumulatorRecursionInstr = nullptr; + // Map from sync region to return blocks to sync for that sync region. + DenseMap> ReturnBlocksToSync; + TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI, AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) @@ -443,6 +448,8 @@ class TailRecursionEliminator { bool eliminateCall(CallInst *CI); + void InsertSyncsIntoReturnBlocks(); + void cleanupAndFinalize(); bool processBlock(BasicBlock &BB); @@ -517,10 +524,17 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) { // Move all fixed sized allocas from HeaderBB to NewEntry. for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(), NEBI = NewEntry->begin(); - OEBI != E;) - if (AllocaInst *AI = dyn_cast(OEBI++)) + OEBI != E;) { + auto I = OEBI++; + if (AllocaInst *AI = dyn_cast(I)) { if (isa(AI->getArraySize())) AI->moveBefore(&*NEBI); + } else if (IntrinsicInst *II = dyn_cast(I)) { + // Also move syncregions to NewEntry. + if (Intrinsic::syncregion_start == II->getIntrinsicID()) + II->moveBefore(&*NEBI); + } + } // Now that we have created a new block, which jumps to the entry // block, insert a PHI node for each argument of the function. @@ -816,6 +830,104 @@ void TailRecursionEliminator::cleanupAndFinalize() { } } +static void +getReturnBlocksToSync(BasicBlock *Entry, SyncInst *Sync, + SmallPtrSetImpl &ReturnBlocksToSync) { + // Walk the CFG from the entry block, stopping traversal at any sync within + // the same region. Record all blocks found that are terminated by a return + // instruction. + Value *SyncRegion = Sync->getSyncRegion(); + SmallVector WorkList; + SmallPtrSet Visited; + WorkList.push_back(Entry); + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Skip paths that are synced within the same region. + if (SyncInst *SI = dyn_cast(BB->getTerminator())) + if (SI->getSyncRegion() == SyncRegion) + continue; + + // If we find a return, we must add a sync before it if we eliminate a + // recursive tail call. + if (isa(BB->getTerminator())) + ReturnBlocksToSync.insert(BB); + + // Queue up successors to search. 
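+    // Successors equal to the block containing Sync itself are not queued;
+    // together with the same-region sync check above, this means only
+    // returns reachable without first passing a sync of this region are
+    // recorded.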
+ for (BasicBlock *Succ : successors(BB)) + if (Succ != Sync->getParent()) + WorkList.push_back(Succ); + } +} + +static bool hasPrecedingSync(SyncInst *SI) { + // TODO: Save the results from previous calls to hasPrecedingSync, in order to + // speed up multiple calls to this routine for different sync instructions. + SmallPtrSet Visited; + SmallVector Worklist; + Worklist.push_back(SI); + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + if (!Visited.insert(I->getParent()).second) + continue; + + // Scan the basic block in reverse for a taskframe.end. If found, skip the + // search to the corresponding taskframe.create(). + BasicBlock::iterator Iter(I); + BasicBlock::const_iterator BBStart(I->getParent()->begin()); + bool FoundPred = false; + while (Iter != BBStart) { + Instruction *I = &*Iter; + if (isTapirIntrinsic(Intrinsic::taskframe_end, I)) { + CallInst *TFEnd = cast(I); + Instruction *TaskFrame = cast(TFEnd->getArgOperand(0)); + if (TaskFrame->getParent() == I->getParent()) { + Iter = TaskFrame->getIterator(); + continue; + } + Worklist.push_back(TaskFrame); + FoundPred = true; + break; + } + Iter--; + } + + // If this block contains a taskframe.end whose taskframe.create exists in + // another block, then we're done with this block. + if (FoundPred) + continue; + + // Add predecessors of this block to the search, based on their terminators. + for (BasicBlock *Pred : predecessors(I->getParent())) { + Instruction *TI = Pred->getTerminator(); + // If we find a sync, then the searchis done. + if (isa(TI)) + return true; + + // Skip predecessors terminated by reattaches or detached.rethrows. This + // block will also have a detach as its predecessor, where we'll continue + // the search. + if (isa(TI) || isDetachedRethrow(TI)) + continue; + + // If we find a taskframe.resume, jump the search to the corresponding + // taskframe.create. + if (isTaskFrameResume(TI)) { + CallBase *CB = dyn_cast(TI); + Instruction *TaskFrame = cast(CB->getArgOperand(0)); + Worklist.push_back(TaskFrame); + continue; + } + // Otherwise, add the terminator to the worklist. + Worklist.push_back(TI); + } + } + // We finished the search and did not find a preceding sync. + return false; +} + bool TailRecursionEliminator::processBlock(BasicBlock &BB) { Instruction *TI = BB.getTerminator(); @@ -854,11 +966,163 @@ bool TailRecursionEliminator::processBlock(BasicBlock &BB) { if (CI) return eliminateCall(CI); + } else if (SyncInst *SI = dyn_cast(TI)) { + + BasicBlock *Succ = SI->getSuccessor(0); + // If the successor is terminated by a sync.unwind (which will necessarily + // be an invoke), skip TRE. + if (isSyncUnwind(Succ->getTerminator())) + return false; + + // Try to find a return instruction in the block following a sync. + Instruction *NextI = Succ->getFirstNonPHIOrDbgOrSyncUnwind(true); + Instruction *TapirRuntimeToRemove = nullptr; + if (isTapirIntrinsic(Intrinsic::tapir_runtime_end, NextI)) { + TapirRuntimeToRemove = + cast(cast(NextI)->getArgOperand(0)); + NextI = &*(++NextI->getIterator()); + } + ReturnInst *Ret = dyn_cast(NextI); + + BasicBlock *BrSucc = nullptr; + if (!Ret) { + // After the sync, there might be a block with a sync.unwind instruction + // and an unconditional branch to a block containing just a return. Check + // for this structure. 
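+      // That is, match the shape:  sync  ->  [ sync.unwind; br %retblock ]  ->  [ ret ].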
+ if (BranchInst *BI = dyn_cast(NextI)) { + if (BI->isConditional()) + return false; + + BrSucc = BI->getSuccessor(0); + Ret = dyn_cast(BrSucc->getFirstNonPHIOrDbg(true)); + } + } + if (!Ret) + return false; + + CallInst *CI = findTRECandidate(&BB); + + if (!CI) + return false; + + // Check that all instructions between the candidate tail call and the sync + // can be moved above the call. In particular, we disallow accumulator + // recursion elimination for tail calls before a sync. + BasicBlock::iterator BBI(CI); + for (++BBI; &*BBI != SI; ++BBI) + if (!canMoveAboveCall(&*BBI, CI, AA)) + break; + if (&*BBI != SI) + return false; + + // Get the sync region for this sync. + Value *SyncRegion = SI->getSyncRegion(); + BasicBlock *OldEntryBlock = &BB.getParent()->getEntryBlock(); + + // Check that the sync region begins in the entry block of the function. + if (cast(SyncRegion)->getParent() != OldEntryBlock) { + LLVM_DEBUG(dbgs() << "Cannot eliminate tail call " << *CI + << ": sync region does not start in entry block."); + return false; + } + + // Check for preceding syncs, since TRE would cause those syncs to + // synchronize any computations that this sync currently syncs. + if (hasPrecedingSync(SI)) + return false; + + // Get returns reachable from newly created loop. + getReturnBlocksToSync(OldEntryBlock, SI, ReturnBlocksToSync[SyncRegion]); + + // If we found a tapir.runtime.end intrinsic between the sync and return, + // remove it. + if (TapirRuntimeToRemove) { + SmallVector ToErase; + for (User *U : TapirRuntimeToRemove->users()) { + if (Instruction *I = dyn_cast(U)) { + if (!isTapirIntrinsic(Intrinsic::tapir_runtime_end, I)) + return false; + ToErase.push_back(I); + } + } + LLVM_DEBUG(dbgs() << "ERASING: " << *TapirRuntimeToRemove << "\n"); + for (Instruction *I : ToErase) + I->eraseFromParent(); + TapirRuntimeToRemove->eraseFromParent(); + } + + // If we found a sync.unwind and unconditional branch between the sync and + // return, first fold the return into this unconditional branch. + if (BrSucc) { + LLVM_DEBUG(dbgs() << "FOLDING: " << *BrSucc + << "INTO UNCOND BRANCH PRED: " << *Succ); + FoldReturnIntoUncondBranch(Ret, BrSucc, Succ, &DTU); + } + + // Fold the return into the sync. + LLVM_DEBUG(dbgs() << "FOLDING: " << *Succ << "INTO SYNC PRED: " << BB); + FoldReturnIntoUncondBranch(Ret, Succ, &BB, &DTU); + ++NumRetDuped; + + // If all predecessors of Succ have been eliminated by + // FoldReturnIntoUncondBranch, delete it. It is important to empty it, + // because the ret instruction in there is still using a value which + // eliminateCall will attempt to remove. This block can only contain + // instructions that can't have uses, therefore it is safe to remove. + if (pred_empty(Succ)) + DTU.deleteBB(Succ); + + bool EliminatedCall = eliminateCall(CI); + + // If a recursive tail was eliminated, fix up the syncs and sync region in + // the CFG. + if (EliminatedCall) { + // We defer the restoration of syncs at relevant return blocks until after + // all blocks are processed. This approach simplifies the logic for + // eliminating multiple tail calls that are only separated from the return + // by a sync, since the CFG won't be perturbed unnecessarily. + } else { + // Restore the sync that was eliminated. 
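+      // Split the return block at the return and reinsert a sync that branches to it.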
+ BasicBlock *RetBlock = Ret->getParent(); + BasicBlock *NewRetBlock = SplitBlock(RetBlock, Ret, &DTU); + ReplaceInstWithInst(RetBlock->getTerminator(), + SyncInst::Create(NewRetBlock, SyncRegion)); + // The earlier call to FoldReturnIntoUncondBranch did not remove the + // sync.unwind, so there's nothing to do to restore the sync.unwind. + } + + return EliminatedCall; } return false; } +void TailRecursionEliminator::InsertSyncsIntoReturnBlocks() { + Function *SyncUnwindFn = + Intrinsic::getDeclaration(F.getParent(), Intrinsic::sync_unwind); + BasicBlock &NewEntry = F.getEntryBlock(); + + for (auto ReturnsToSync : ReturnBlocksToSync) { + Value *SyncRegion = ReturnsToSync.first; + SmallPtrSetImpl &ReturnBlocks = ReturnsToSync.second; + + // Move the sync region start to the new entry block. + cast(SyncRegion)->moveBefore(&*(NewEntry.begin())); + + // Insert syncs before relevant return blocks. + for (BasicBlock *RetBlock : ReturnBlocks) { + BasicBlock *NewRetBlock = + SplitBlock(RetBlock, RetBlock->getTerminator(), &DTU); + ReplaceInstWithInst(RetBlock->getTerminator(), + SyncInst::Create(NewRetBlock, SyncRegion)); + + if (!F.doesNotThrow()) + CallInst::Create(SyncUnwindFn, {SyncRegion}, "", + NewRetBlock->getTerminator()); + } + } +} + bool TailRecursionEliminator::eliminate(Function &F, const TargetTransformInfo *TTI, AliasAnalysis *AA, @@ -884,6 +1148,9 @@ bool TailRecursionEliminator::eliminate(Function &F, for (BasicBlock &BB : F) MadeChange |= TRE.processBlock(BB); + if (!TRE.ReturnBlocksToSync.empty()) + TRE.InsertSyncsIntoReturnBlocks(); + TRE.cleanupAndFinalize(); return MadeChange; diff --git a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp index e53019768e88190..ab75cef5a1031ef 100644 --- a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp +++ b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp @@ -80,6 +80,20 @@ static void warnAboutLeftoverTransformations(Loop *L, "requested transformation; the transformation might be disabled or " "specified as part of an unsupported transformation ordering"); } + + if (hasLoopStripmineTransformation(L) == TM_ForcedByUser) { + LLVM_DEBUG(dbgs() << "Leftover loop-stripmine transformation\n"); + ORE->emit( + DiagnosticInfoOptimizationFailure(DEBUG_TYPE, + "FailedRequestedStripmining", + L->getStartLoc(), L->getHeader()) + << "loop not stripmined: the optimizer was unable to perform the " + "requested transformation; the transformation might be disabled or " + "specified as part of an unsupported transformation ordering"); + } + + // This pass doesn't check whether LoopSpawning has been performed, because + // LoopSpawning runs as part of Tapir lowering, after this pass has run. 
} static void warnAboutLeftoverTransformations(Function *F, LoopInfo *LI, diff --git a/llvm/lib/Transforms/Tapir/CMakeLists.txt b/llvm/lib/Transforms/Tapir/CMakeLists.txt new file mode 100644 index 000000000000000..01932b3d3d18683 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/CMakeLists.txt @@ -0,0 +1,39 @@ +add_llvm_component_library(LLVMTapirOpts + DRFScopedNoAliasAA.cpp + LambdaABI.cpp + LoopSpawningTI.cpp + LoopStripMine.cpp + LoopStripMinePass.cpp + LoweringUtils.cpp + OMPTaskABI.cpp + OpenCilkABI.cpp + Outline.cpp + QthreadsABI.cpp + SerialABI.cpp + SerializeSmallTasks.cpp + Tapir.cpp + TapirToTarget.cpp + TapirLoopInfo.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Tapir + + DEPENDS + intrinsics_gen + LLVMLinker + + COMPONENT_NAME + TapirOpts + + LINK_COMPONENTS + Analysis + Core + IRReader + Linker + MC + Scalar + Support + TransformUtils + Vectorize + ) diff --git a/llvm/lib/Transforms/Tapir/DRFScopedNoAliasAA.cpp b/llvm/lib/Transforms/Tapir/DRFScopedNoAliasAA.cpp new file mode 100644 index 000000000000000..d590406be7e5a01 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/DRFScopedNoAliasAA.cpp @@ -0,0 +1,332 @@ +//===- DRFScopedNoAliasAA.cpp - DRF-based scoped-noalias metadata ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Adds scoped-noalias metadata to memory accesses based on Tapir's parallel +// control flow constructs and the assumption that the function is data-race +// free. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/DRFScopedNoAliasAA.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Tapir.h" + +#define DEBUG_TYPE "drf-scoped-noalias" + +using namespace llvm; + +/// Process Tapir loops within the given function for loop spawning. 
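+/// Specifically, DRFScopedNoAliasImpl annotates memory accesses with
+/// scoped-noalias metadata derived from the Tapir task structure, under the
+/// assumption that the function is data-race free.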
+class DRFScopedNoAliasImpl { +public: + DRFScopedNoAliasImpl(Function &F, TaskInfo &TI, AliasAnalysis &AA, + LoopInfo *LI) + : F(F), TI(TI), LI(LI) { + TI.evaluateParallelState(MPTasks); + } + + bool run(); + +private: + bool populateTaskScopeNoAlias(); + + bool populateSubTaskScopeNoAlias( + const Task *T, MDBuilder &MDB, SmallVectorImpl &CurrScopes, + SmallVectorImpl &CurrNoAlias, + DenseMap &TaskToScope); + + bool populateTaskScopeNoAliasInBlock( + const Task *T, BasicBlock *BB, MDBuilder &MDB, + SmallVectorImpl &Scopes, + SmallVectorImpl &NoAlias); + + Function &F; + TaskInfo &TI; + LoopInfo *LI; + + MaybeParallelTasks MPTasks; +}; + +namespace { +struct DRFScopedNoAliasWrapperPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + explicit DRFScopedNoAliasWrapperPass() : FunctionPass(ID) { + initializeDRFScopedNoAliasWrapperPassPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Assume DRF to Add Scoped-No-Alias Metadata"; + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + } +}; +} // End of anonymous namespace + +char DRFScopedNoAliasWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(DRFScopedNoAliasWrapperPass, "drf-scoped-noalias", + "Add DRF-based scoped-noalias metadata", + false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) +INITIALIZE_PASS_END(DRFScopedNoAliasWrapperPass, "drf-scoped-noalias", + "Add DRF-based scoped-noalias metadata", + false, false) + +bool DRFScopedNoAliasImpl::populateTaskScopeNoAliasInBlock( + const Task *T, BasicBlock *BB, MDBuilder &MDB, + SmallVectorImpl &Scopes, SmallVectorImpl &NoAlias) { + LLVM_DEBUG(dbgs() << "Processing block " << BB->getName() << " in task " + << T->getEntry()->getName() << "\n"); + for (Instruction &I : *BB) { + bool IsArgMemOnlyCall = false, IsFuncCall = false; + SmallVector PtrArgs; + + if (const LoadInst *LI = dyn_cast(&I)) + PtrArgs.push_back(LI->getPointerOperand()); + else if (const StoreInst *SI = dyn_cast(&I)) + PtrArgs.push_back(SI->getPointerOperand()); + else if (const VAArgInst *VAAI = dyn_cast(&I)) + PtrArgs.push_back(VAAI->getPointerOperand()); + else if (const AtomicCmpXchgInst *CXI = dyn_cast(&I)) + PtrArgs.push_back(CXI->getPointerOperand()); + else if (const AtomicRMWInst *RMWI = dyn_cast(&I)) + PtrArgs.push_back(RMWI->getPointerOperand()); + else if (const CallBase *ICS = dyn_cast(&I)) { + // We don't need to worry about callsites that don't access memory. + if (ICS->doesNotAccessMemory()) + continue; + + IsFuncCall = true; + if (ICS->onlyAccessesArgMemory()) + IsArgMemOnlyCall = true; + + for (Value *Arg : ICS->args()) { + // We need to check the underlying objects of all arguments, not just + // the pointer arguments, because we might be passing pointers as + // integers, etc. + // However, if we know that the call only accesses pointer arguments, + // then we only need to check the pointer arguments. + if (IsArgMemOnlyCall && !Arg->getType()->isPointerTy()) + continue; + + PtrArgs.push_back(Arg); + } + } + + // If we found no pointers, then this instruction is not suitable for + // pairing with an instruction to receive aliasing metadata. 
However, if + // this is a call, this we might just alias with none of the noalias + // arguments. + if (PtrArgs.empty() && !IsFuncCall) + continue; + + // It is possible that there is only one underlying object, but you need to + // go through several PHIs to see it, and thus could be repeated in the + // Objects list. + bool UsesObjectOutsideTask = false; + for (const Value *V : PtrArgs) { + SmallVector Objects; + getUnderlyingObjects(const_cast(V), Objects, LI); + + for (const Value *O : Objects) { + LLVM_DEBUG(dbgs() << "Checking object " << *O << "\n"); + // Check if this value is a constant that cannot be derived from any + // pointer value (we need to exclude constant expressions, for example, + // that are formed from arithmetic on global symbols). + bool IsNonPtrConst = isa(V) || isa(V) || + isa(V) || + isa(V) || isa(V); + if (IsNonPtrConst) + continue; + + // Check if this object was created in this task. + if (const Instruction *OI = dyn_cast(O)) + if (TI.getTaskFor(OI->getParent()) == T) + continue; + + // This object exists outside the task. + UsesObjectOutsideTask = true; + break; + } + // Quit early if a pointer argument is found that refers to an object + // allocated outside of this task. + if (UsesObjectOutsideTask) + break; + } + + // If this instruction does not refer to an object outside of the task, + // don't add noalias metadata. + if (!UsesObjectOutsideTask) { + LLVM_DEBUG(dbgs() << "Instruction " << I + << " does not use object outside of task " + << T->getEntry()->getName() << "\n"); + continue; + } + + if (!NoAlias.empty()) + I.setMetadata(LLVMContext::MD_noalias, + MDNode::concatenate( + I.getMetadata(LLVMContext::MD_noalias), + MDNode::get(F.getContext(), NoAlias))); + + if (!Scopes.empty()) + I.setMetadata( + LLVMContext::MD_alias_scope, + MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope), + MDNode::get(F.getContext(), Scopes))); + } + return true; +} + +bool DRFScopedNoAliasImpl::populateSubTaskScopeNoAlias( + const Task *T, MDBuilder &MDB, SmallVectorImpl &CurrScopes, + SmallVectorImpl &CurrNoAlias, + DenseMap &TaskToScope) { + bool Changed = false; + size_t OrigNoAliasSize = CurrNoAlias.size(); + + // FIXME? Separately handle shared EH spindles. + for (Spindle *S : depth_first>(T->getEntrySpindle())) { + for (const Task *MPT : MPTasks.TaskList[S]) { + // Don't record noalias scopes for maybe-parallel tasks that enclose the + // spindle. These cases arise from parallel loops, which need special + // alias analysis anyway (e.g., LoopAccessAnalysis). + if (!MPT->encloses(S->getEntry())) + CurrNoAlias.push_back(TaskToScope[MPT]); + } + // Populate instructions in spindle with scoped-noalias information. + for (BasicBlock *BB : S->blocks()) + Changed |= + populateTaskScopeNoAliasInBlock(T, BB, MDB, CurrScopes, CurrNoAlias); + + // Remove the noalias scopes for this spindle. + CurrNoAlias.erase(CurrNoAlias.begin() + OrigNoAliasSize, CurrNoAlias.end()); + + // For each successor spindle in a subtask, recursively populate the + // scoped-noalias information in that subtask. 
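+    // The subtask's scope is pushed while recursing so that accesses in the
+    // subtask are tagged with that alias scope, and popped again afterwards.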
+ for (Spindle *Succ : successors(S)) { + if (S->succInSubTask(Succ)) { + CurrScopes.push_back(TaskToScope[Succ->getParentTask()]); + populateSubTaskScopeNoAlias(Succ->getParentTask(), MDB, CurrScopes, + CurrNoAlias, TaskToScope); + CurrScopes.pop_back(); + } + } + } + + return Changed; +} + +static void createTaskDomainsAndFullScopes( + const Task *T, MDBuilder &MDB, const Twine ParentName, + DenseMap &TaskToDomain, + DenseMap &TaskToScope) { + // Within the domain of T, create a scope and domain for each subtask. + for (const Task *SubT : T->subtasks()) { + const Twine Name = ParentName + "_" + SubT->getEntry()->getName(); + + MDNode *NewScope = + MDB.createAnonymousAliasScope(TaskToDomain[T], ("taskscp_" + Name).str()); + TaskToScope[SubT] = NewScope; + MDNode *NewDomain = + MDB.createAnonymousAliasScopeDomain(("taskdom_" + Name).str()); + TaskToDomain[SubT] = NewDomain; + + // Recursively create domains and scopes for subtasks. + createTaskDomainsAndFullScopes(SubT, MDB, Name, TaskToDomain, TaskToScope); + } +} + +bool DRFScopedNoAliasImpl::populateTaskScopeNoAlias() { + // Create a domain for the task scopes. + MDBuilder MDB(F.getContext()); + if (TI.isSerial()) + return false; + + DenseMap TaskToDomain; + DenseMap TaskToScope; + + // Create a domain and scope for the root task. + MDNode *NewDomain = + MDB.createAnonymousAliasScopeDomain(("dom_" + F.getName()).str()); + TaskToDomain[TI.getRootTask()] = NewDomain; + MDNode *NewScope = + MDB.createAnonymousAliasScope(NewDomain, ("scp_" + F.getName()).str()); + TaskToScope[TI.getRootTask()] = NewScope; + + // Recursively create task domains and scopes for subtasks. + createTaskDomainsAndFullScopes(TI.getRootTask(), MDB, F.getName(), + TaskToDomain, TaskToScope); + + SmallVector Scopes, NoAlias; + return populateSubTaskScopeNoAlias(TI.getRootTask(), MDB, Scopes, NoAlias, + TaskToScope); +} + +bool DRFScopedNoAliasImpl::run() { + return populateTaskScopeNoAlias(); +} + +bool DRFScopedNoAliasWrapperPass::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + TaskInfo &TI = getAnalysis().getTaskInfo(); + AliasAnalysis &AA = getAnalysis().getAAResults(); + LoopInfo &LI = getAnalysis().getLoopInfo(); + return DRFScopedNoAliasImpl(F, TI, AA, &LI).run(); +} + +// createDRFScopedNoAliasPass - Provide an entry point to create this pass. +// +namespace llvm { +FunctionPass *createDRFScopedNoAliasWrapperPass() { + return new DRFScopedNoAliasWrapperPass(); +} +} // end namespace llvm + +PreservedAnalyses DRFScopedNoAliasPass::run(Function &F, + FunctionAnalysisManager &AM) { + TaskInfo &TI = AM.getResult(F); + AliasAnalysis &AA = AM.getResult(F); + LoopInfo &LI = AM.getResult(F); + + DRFScopedNoAliasImpl(F, TI, AA, &LI).run(); + + PreservedAnalyses PA; + PA.preserve(); + PA.preserve(); + PA.preserve(); + return PA; +} diff --git a/llvm/lib/Transforms/Tapir/LambdaABI.cpp b/llvm/lib/Transforms/Tapir/LambdaABI.cpp new file mode 100644 index 000000000000000..0ec5e900c7559e1 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/LambdaABI.cpp @@ -0,0 +1,575 @@ +//===- LambdaABI.cpp - Generic interface to various runtime systems--------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the Lambda ABI to convert Tapir instructions to calls +// into a generic runtime system to operates on spawned computations as lambdas. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/LambdaABI.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ModRef.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "lambdaabi" + +extern cl::opt DebugABICalls; + +static cl::opt + ClRuntimeBCPath("tapir-runtime-bc-path", cl::init(""), + cl::desc("Path to the bitcode file for the runtime ABI"), + cl::Hidden); + +static const StringRef StackFrameName = "__rts_sf"; + +namespace { + +// Custom DiagnosticInfo for linking the Lambda ABI bitcode file. +class LambdaABILinkDiagnosticInfo : public DiagnosticInfo { + const Module *SrcM; + const Twine &Msg; + +public: + LambdaABILinkDiagnosticInfo(DiagnosticSeverity Severity, const Module *SrcM, + const Twine &Msg) + : DiagnosticInfo(DK_Lowering, Severity), SrcM(SrcM), Msg(Msg) {} + void print(DiagnosticPrinter &DP) const override { + DP << "linking module '" << SrcM->getModuleIdentifier() << "': " << Msg; + } +}; + +// Custom DiagnosticHandler to handle diagnostics arising when linking the +// Lambda ABI bitcode file. +class LambdaABIDiagnosticHandler final : public DiagnosticHandler { + const Module *SrcM; + DiagnosticHandler *OrigHandler; + +public: + LambdaABIDiagnosticHandler(const Module *SrcM, DiagnosticHandler *OrigHandler) + : SrcM(SrcM), OrigHandler(OrigHandler) {} + + bool handleDiagnostics(const DiagnosticInfo &DI) override { + if (DI.getKind() != DK_Linker) + return OrigHandler->handleDiagnostics(DI); + + std::string MsgStorage; + { + raw_string_ostream Stream(MsgStorage); + DiagnosticPrinterRawOStream DP(Stream); + DI.print(DP); + } + return OrigHandler->handleDiagnostics( + LambdaABILinkDiagnosticInfo(DI.getSeverity(), SrcM, MsgStorage)); + } +}; + +// Structure recording information about runtime ABI functions. +struct RTSFnDesc { + StringRef FnName; + FunctionType *FnType; + FunctionCallee &FnCallee; +}; +} // namespace + +// void LambdaABI::setOptions(const TapirTargetOptions &Options) { +// if (!isa(Options)) +// return; + +// const LambdaABIOptions &OptionsCast = cast(Options); + +// // Get the path to the runtime bitcode file. 
+// RuntimeBCPath = OptionsCast.getRuntimeBCPath(); +// } + +void LambdaABI::prepareModule() { + LLVMContext &C = M.getContext(); + const DataLayout &DL = DestM.getDataLayout(); + Type *Int8Ty = Type::getInt8Ty(C); + Type *Int16Ty = Type::getInt16Ty(C); + Type *Int32Ty = Type::getInt32Ty(C); + Type *Int64Ty = Type::getInt64Ty(C); + + // If a runtime bitcode path is given via the command line, use it. + if ("" != ClRuntimeBCPath) + RuntimeBCPath = ClRuntimeBCPath; + + if ("" == RuntimeBCPath) { + C.emitError("LambdaABI: No bitcode ABI file given."); + return; + } + + LLVM_DEBUG(dbgs() << "Using external bitcode file for Lambda ABI: " + << RuntimeBCPath << "\n"); + SMDiagnostic SMD; + + // Parse the bitcode file. This call imports structure definitions, but not + // function definitions. + if (std::unique_ptr ExternalModule = + parseIRFile(RuntimeBCPath, SMD, C)) { + // Get the original DiagnosticHandler for this context. + std::unique_ptr OrigDiagHandler = + C.getDiagnosticHandler(); + + // Setup an LambdaABIDiagnosticHandler for this context, to handle + // diagnostics that arise from linking ExternalModule. + C.setDiagnosticHandler(std::make_unique( + ExternalModule.get(), OrigDiagHandler.get())); + + // Link the external module into the current module, copying over global + // values. + // + // TODO: Consider restructuring the import process to use + // Linker::Flags::LinkOnlyNeeded to copy over only the necessary contents + // from the external module. + bool Fail = Linker::linkModules( + M, std::move(ExternalModule), Linker::Flags::None, + [](Module &M, const StringSet<> &GVS) { + for (StringRef GVName : GVS.keys()) { + LLVM_DEBUG(dbgs() << "Linking global value " << GVName << "\n"); + if (Function *Fn = M.getFunction(GVName)) { + if (!Fn->isDeclaration() && !Fn->hasComdat()) + // We set the function's linkage as available_externally, so + // that subsequent optimizations can remove these definitions + // from the module. We don't want this module redefining any of + // these symbols, even if they aren't inlined, because the + // Lambda runtime library will provide those definitions later. + Fn->setLinkage(Function::AvailableExternallyLinkage); + } else if (GlobalVariable *G = M.getGlobalVariable(GVName)) { + if (!G->isDeclaration() && !G->hasComdat()) + G->setLinkage(GlobalValue::AvailableExternallyLinkage); + } + } + }); + if (Fail) + C.emitError("LambdaABI: Failed to link bitcode ABI file: " + + Twine(RuntimeBCPath)); + + // Restore the original DiagnosticHandler for this context. + C.setDiagnosticHandler(std::move(OrigDiagHandler)); + } else { + C.emitError("LambdaABI: Failed to parse bitcode ABI file: " + + Twine(RuntimeBCPath)); + } + + // Get or create local definitions of RTS structure types. + const char *StackFrameName = "struct.__rts_stack_frame"; + StackFrameTy = StructType::lookupOrCreate(C, StackFrameName); + + PointerType *StackFramePtrTy = PointerType::getUnqual(StackFrameTy); + Type *VoidTy = Type::getVoidTy(C); + Type *VoidPtrTy = Type::getInt8PtrTy(C); + + // Define the types of the RTS functions. 
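+  // __rts_enter_frame, __rts_leave_frame, and the sync entry points take a
+  // pointer to the caller's __rts_stack_frame; __rts_spawn additionally takes
+  // the outlined body, its argument struct, and the argument size and alignment.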
+ FunctionType *RTSFnTy = FunctionType::get(VoidTy, {StackFramePtrTy}, false); + SpawnBodyFnArgTy = VoidPtrTy; + Type *IntPtrTy = DL.getIntPtrType(C); + SpawnBodyFnArgSizeTy = IntPtrTy; + SpawnBodyFnTy = FunctionType::get(VoidTy, {SpawnBodyFnArgTy}, false); + FunctionType *SpawnFnTy = + FunctionType::get(VoidTy, + {StackFramePtrTy, PointerType::getUnqual(SpawnBodyFnTy), + SpawnBodyFnArgTy, SpawnBodyFnArgSizeTy, IntPtrTy}, + false); + FunctionType *Grainsize8FnTy = FunctionType::get(Int8Ty, {Int8Ty}, false); + FunctionType *Grainsize16FnTy = FunctionType::get(Int16Ty, {Int16Ty}, false); + FunctionType *Grainsize32FnTy = FunctionType::get(Int32Ty, {Int32Ty}, false); + FunctionType *Grainsize64FnTy = FunctionType::get(Int64Ty, {Int64Ty}, false); + FunctionType *WorkerInfoTy = FunctionType::get(Int32Ty, {}, false); + + // Create an array of RTS functions, with their associated types and + // FunctionCallee member variables in the LambdaABI class. + RTSFnDesc RTSFunctions[] = { + {"__rts_enter_frame", RTSFnTy, RTSEnterFrame}, + {"__rts_spawn", SpawnFnTy, RTSSpawn}, + {"__rts_leave_frame", RTSFnTy, RTSLeaveFrame}, + {"__rts_sync", RTSFnTy, RTSSync}, + {"__rts_sync_nothrow", RTSFnTy, RTSSyncNoThrow}, + {"__rts_loop_grainsize_8", Grainsize8FnTy, RTSLoopGrainsize8}, + {"__rts_loop_grainsize_16", Grainsize16FnTy, RTSLoopGrainsize16}, + {"__rts_loop_grainsize_32", Grainsize32FnTy, RTSLoopGrainsize32}, + {"__rts_loop_grainsize_64", Grainsize64FnTy, RTSLoopGrainsize64}, + {"__rts_get_num_workers", WorkerInfoTy, RTSGetNumWorkers}, + {"__rts_get_worker_id", WorkerInfoTy, RTSGetWorkerID}, + }; + + // Add attributes to internalized functions. + for (RTSFnDesc FnDesc : RTSFunctions) { + assert(!FnDesc.FnCallee && "Redefining RTS function"); + FnDesc.FnCallee = M.getOrInsertFunction(FnDesc.FnName, FnDesc.FnType); + assert(isa(FnDesc.FnCallee.getCallee()) && + "Runtime function is not a function"); + Function *Fn = cast(FnDesc.FnCallee.getCallee()); + + Fn->setDoesNotThrow(); + + // Unless we're debugging, mark the function as always_inline. This + // attribute is required for some functions, but is helpful for all + // functions. + if (!DebugABICalls) + Fn->addFnAttr(Attribute::AlwaysInline); + else + Fn->removeFnAttr(Attribute::AlwaysInline); + + if (Fn->getName() == "__rts_get_num_workers" || + Fn->getName() == "__rts_get_worker_id") { + Fn->setLinkage(Function::InternalLinkage); + } + } + + // If no valid bitcode file was found fill in the missing pieces. + // An error should have been emitted already unless the user + // set DebugABICalls. + + if (StackFrameTy->isOpaque()) { + // TODO: Figure out better handling of this potential error. + LLVM_DEBUG(dbgs() << "LambdaABI: Failed to find __rts_stack_frame type.\n"); + // Create a dummy __rts_stack_frame structure + StackFrameTy->setBody(Int64Ty); + } + // Create declarations of all RTS functions, and add basic attributes to those + // declarations. + for (RTSFnDesc FnDesc : RTSFunctions) { + if (FnDesc.FnCallee) + continue; + FnDesc.FnCallee = M.getOrInsertFunction(FnDesc.FnName, FnDesc.FnType); + assert(isa(FnDesc.FnCallee.getCallee()) && + "RTS function is not a function"); + Function *Fn = cast(FnDesc.FnCallee.getCallee()); + + Fn->setDoesNotThrow(); + } +} + +void LambdaABI::addHelperAttributes(Function &Helper) { + // Inlining the helper function is not legal. + Helper.removeFnAttr(Attribute::AlwaysInline); + Helper.addFnAttr(Attribute::NoInline); + // If the helper uses an argument structure, then it is not a write-only + // function. 
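+  // Conservatively mark it as reading and writing other memory instead.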
+ if (getArgStructMode() != ArgStructMode::None) { + Helper.removeFnAttr(Attribute::WriteOnly); + Helper.setMemoryEffects( + MemoryEffects(MemoryEffects::Location::Other, ModRefInfo::ModRef)); + } + // Note that the address of the helper is unimportant. + Helper.setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + // The helper is internal to this module. We use internal linkage, rather + // than private linkage, so that tools can still reference the helper + // function. + Helper.setLinkage(GlobalValue::InternalLinkage); +} + +// Check whether the allocation of a __rts_stack_frame can be inserted after +// instruction \p I. +static bool skipInstruction(const Instruction &I) { + if (isa(I)) + return true; + + if (isa(I)) + return true; + + if (const IntrinsicInst *II = dyn_cast(&I)) { + // Skip simple intrinsics + switch (II->getIntrinsicID()) { + case Intrinsic::annotation: + case Intrinsic::assume: + case Intrinsic::sideeffect: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + case Intrinsic::is_constant: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + case Intrinsic::experimental_gc_result: + case Intrinsic::experimental_gc_relocate: + case Intrinsic::experimental_noalias_scope_decl: + case Intrinsic::syncregion_start: + case Intrinsic::taskframe_create: + return true; + default: + return false; + } + } + + return false; +} + +// Scan the basic block \p B to find a point to insert the allocation of a +// __rts_stack_frame. +static Instruction *getStackFrameInsertPt(BasicBlock &B) { + BasicBlock::iterator BI(B.getFirstInsertionPt()); + BasicBlock::const_iterator BE(B.end()); + + // Scan the basic block for the first instruction we should not skip. + while (BI != BE) { + if (!skipInstruction(*BI)) { + return &*BI; + } + ++BI; + } + + // We reached the end of the basic block; return the terminator. + return B.getTerminator(); +} + +/// Create the __rts_stack_frame for the spawning function. +Value *LambdaABI::CreateStackFrame(Function &F) { + const DataLayout &DL = F.getParent()->getDataLayout(); + Type *SFTy = StackFrameTy; + + IRBuilder<> B(getStackFrameInsertPt(F.getEntryBlock())); + AllocaInst *SF = B.CreateAlloca(SFTy, DL.getAllocaAddrSpace(), + /*ArraySize*/ nullptr, + /*Name*/ StackFrameName); + + SF->setAlignment(StackFrameAlign); + + return SF; +} + +Value *LambdaABI::GetOrCreateStackFrame(Function &F) { + if (DetachCtxToStackFrame.count(&F)) + return DetachCtxToStackFrame[&F]; + + Value *SF = CreateStackFrame(F); + DetachCtxToStackFrame[&F] = SF; + + return SF; +} + +// Insert a call in Function F to __rts_enter_frame to initialize the +// __rts_stack_frame in F. If TaskFrameCreate is nonnull, the call to +// __rts_enter_frame is inserted at TaskFrameCreate. +CallInst *LambdaABI::InsertStackFramePush(Function &F, + Instruction *TaskFrameCreate, + bool Helper) { + Instruction *SF = cast(GetOrCreateStackFrame(F)); + + BasicBlock::iterator InsertPt = ++SF->getIterator(); + IRBuilder<> B(&(F.getEntryBlock()), InsertPt); + if (TaskFrameCreate) + B.SetInsertPoint(TaskFrameCreate); + if (!B.getCurrentDebugLocation()) { + // Try to find debug information later in this block for the ABI call. 
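+    // Scan forward from the insertion point and borrow the first debug
+    // location found.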
+ BasicBlock::iterator BI = B.GetInsertPoint(); + BasicBlock::const_iterator BE(B.GetInsertBlock()->end()); + while (BI != BE) { + if (DebugLoc Loc = BI->getDebugLoc()) { + B.SetCurrentDebugLocation(Loc); + break; + } + ++BI; + } + } + + Value *Args[1] = {SF}; + return B.CreateCall(RTSEnterFrame, Args); +} + +// Insert a call in Function F to pop the stack frame. +// +// PromoteCallsToInvokes dictates whether call instructions that can throw are +// promoted to invoke instructions prior to inserting the epilogue-function +// calls. +void LambdaABI::InsertStackFramePop(Function &F, bool PromoteCallsToInvokes, + bool InsertPauseFrame, bool Helper) { + Value *SF = GetOrCreateStackFrame(F); + SmallPtrSet Returns; + SmallPtrSet Resumes; + + // Add eh cleanup that returns control to the runtime + EscapeEnumerator EE(F, "rts_cleanup", PromoteCallsToInvokes); + while (IRBuilder<> *Builder = EE.Next()) { + if (ResumeInst *RI = dyn_cast(Builder->GetInsertPoint())) { + if (!RI->getDebugLoc()) + // Attempt to set the debug location of this resume to match one of the + // preceeding terminators. + for (const BasicBlock *Pred : predecessors(RI->getParent())) + if (const DebugLoc &Loc = Pred->getTerminator()->getDebugLoc()) { + RI->setDebugLoc(Loc); + break; + } + Resumes.insert(RI); + } else if (ReturnInst *RI = dyn_cast(Builder->GetInsertPoint())) + Returns.insert(RI); + } + + for (ReturnInst *RI : Returns) { + CallInst::Create(RTSLeaveFrame, {SF}, "", RI) + ->setDebugLoc(RI->getDebugLoc()); + } +} + +/// Lower a call to get the grainsize of a Tapir loop. +Value *LambdaABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { + Value *Limit = GrainsizeCall->getArgOperand(0); + IRBuilder<> Builder(GrainsizeCall); + + // Select the appropriate __rts_grainsize function, based on the type. + FunctionCallee RTSGrainsizeCall; + if (GrainsizeCall->getType()->isIntegerTy(8)) + RTSGrainsizeCall = RTSLoopGrainsize8; + else if (GrainsizeCall->getType()->isIntegerTy(16)) + RTSGrainsizeCall = RTSLoopGrainsize16; + else if (GrainsizeCall->getType()->isIntegerTy(32)) + RTSGrainsizeCall = RTSLoopGrainsize32; + else if (GrainsizeCall->getType()->isIntegerTy(64)) + RTSGrainsizeCall = RTSLoopGrainsize64; + else + llvm_unreachable("No RTSGrainsize call matches type for Tapir loop."); + + Value *Grainsize = Builder.CreateCall(RTSGrainsizeCall, Limit); + + // Replace uses of grainsize intrinsic call with this grainsize value. + GrainsizeCall->replaceAllUsesWith(Grainsize); + return Grainsize; +} + +// Lower a sync instruction SI. +void LambdaABI::lowerSync(SyncInst &SI) { + Function &Fn = *SI.getFunction(); + if (!DetachCtxToStackFrame[&Fn]) + // If we have not created a stackframe for this function, then we don't need + // to handle the sync. + return; + + Value *SF = GetOrCreateStackFrame(Fn); + Value *Args[] = {SF}; + assert(Args[0] && "sync used in function without frame!"); + + Instruction *SyncUnwind = nullptr; + BasicBlock *SyncCont = SI.getSuccessor(0); + BasicBlock *SyncUnwindDest = nullptr; + // Determine whether a sync.unwind immediately follows SI. 
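+  // If one does, the runtime sync call is emitted as an invoke that reuses the
+  // sync.unwind's normal and unwind destinations.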
+ if (InvokeInst *II = + dyn_cast(SyncCont->getFirstNonPHIOrDbgOrLifetime())) { + if (isSyncUnwind(II)) { + SyncUnwind = II; + SyncCont = II->getNormalDest(); + SyncUnwindDest = II->getUnwindDest(); + } + } + + CallBase *CB; + if (!SyncUnwindDest) { + if (Fn.doesNotThrow()) + CB = CallInst::Create(RTSSyncNoThrow, Args, "", + /*insert before*/ &SI); + else + CB = CallInst::Create(RTSSync, Args, "", /*insert before*/ &SI); + + BranchInst::Create(SyncCont, CB->getParent()); + } else { + CB = InvokeInst::Create(RTSSync, SyncCont, SyncUnwindDest, Args, "", + /*insert before*/ &SI); + for (PHINode &PN : SyncCont->phis()) + PN.addIncoming(PN.getIncomingValueForBlock(SyncUnwind->getParent()), + SI.getParent()); + for (PHINode &PN : SyncUnwindDest->phis()) + PN.addIncoming(PN.getIncomingValueForBlock(SyncUnwind->getParent()), + SI.getParent()); + } + CB->setDebugLoc(SI.getDebugLoc()); + SI.eraseFromParent(); +} + +bool LambdaABI::preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) { + return false; +} +void LambdaABI::postProcessFunction(Function &F, bool ProcessingTapirLoops) {} +void LambdaABI::postProcessHelper(Function &F) {} + +void LambdaABI::preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, + bool IsSpawner, BasicBlock *TFEntry) { + if (IsSpawner) + InsertStackFramePush(F, TaskFrameCreate, /*Helper*/ true); +} + +void LambdaABI::postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, + bool IsSpawner, BasicBlock *TFEntry) { + if (IsSpawner) + InsertStackFramePop(F, /*PromoteCallsToInvokes*/ true, + /*InsertPauseFrame*/ true, /*Helper*/ true); +} + +void LambdaABI::preProcessRootSpawner(Function &F, BasicBlock *TFEntry) { + InsertStackFramePush(F); +} + +void LambdaABI::postProcessRootSpawner(Function &F, BasicBlock *TFEntry) { + InsertStackFramePop(F, /*PromoteCallsToInvokes*/ false, + /*InsertPauseFrame*/ false, /*Helper*/ false); +} + +void LambdaABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) { + const DataLayout &DL = DestM.getDataLayout(); + CallBase *ReplCall = cast(TOI.ReplCall); + + Function &F = *ReplCall->getFunction(); + Value *SF = DetachCtxToStackFrame[&F]; + assert(SF && "No frame found for spawning task"); + + // Get the alignment of the helper arguments. The bitcode-ABI functions may + // use the alignment to align the shared variables in the storage allocated by + // the OpenMP runtime, especially to accommodate vector arguments. 
+ AllocaInst *ArgAlloca = cast(ReplCall->getArgOperand(0)); + uint64_t Alignment = + DL.getPrefTypeAlign(ArgAlloca->getAllocatedType()).value(); + + IRBuilder<> B(ReplCall); + Value *FnCast = B.CreateBitCast(ReplCall->getCalledFunction(), + PointerType::getUnqual(SpawnBodyFnTy)); + Value *ArgCast = + B.CreateBitOrPointerCast(ReplCall->getArgOperand(0), SpawnBodyFnArgTy); + auto ArgSize = + cast(ReplCall->getArgOperand(0))->getAllocationSizeInBits(DL); + assert(ArgSize && + "Could not determine size of compiler-generated ArgStruct."); + Value *ArgSizeVal = ConstantInt::get(SpawnBodyFnArgSizeTy, *ArgSize / 8); + + if (InvokeInst *II = dyn_cast(ReplCall)) { + B.CreateInvoke(RTSSpawn, II->getNormalDest(), II->getUnwindDest(), + {SF, FnCast, ArgCast, ArgSizeVal, B.getInt64(Alignment)}); + } else { + B.CreateCall(RTSSpawn, + {SF, FnCast, ArgCast, ArgSizeVal, B.getInt64(Alignment)}); + } + + ReplCall->eraseFromParent(); +} diff --git a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp new file mode 100644 index 000000000000000..5a3fbb8eb083345 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp @@ -0,0 +1,1767 @@ +//===- LoopSpawningTI.cpp - Spawn loop iterations efficiently -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Modify Tapir loops to spawn their iterations efficiently. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/LoopSpawningTI.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Timer.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/IndVarSimplify.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Scalar/LoopDeletion.h" +#include "llvm/Transforms/Tapir.h" +#include "llvm/Transforms/Tapir/LoweringUtils.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Tapir/TapirLoopInfo.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/TapirUtils.h" +#include 
"llvm/Transforms/Utils/ValueMapper.h" +#include + +using namespace llvm; + +#define LS_NAME "loop-spawning-ti" +#define DEBUG_TYPE LS_NAME + +STATISTIC(TapirLoopsFound, + "Number of Tapir loops discovered spawning"); +STATISTIC(LoopsConvertedToDAC, + "Number of Tapir loops converted to divide-and-conquer iteration " + "spawning"); + +static const char TimerGroupName[] = DEBUG_TYPE; +static const char TimerGroupDescription[] = "Loop spawning"; + +/// The default loop-outline processor leaves the outlined Tapir loop as is. +class DefaultLoopOutlineProcessor : public LoopOutlineProcessor { +public: + DefaultLoopOutlineProcessor(Module &M) : LoopOutlineProcessor(M) {} + void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, + ValueToValueMapTy &VMap) override final { + LoopOutlineProcessor::postProcessOutline(TL, Out, VMap); + addSyncToOutlineReturns(TL, Out, VMap); + } +}; + +/// The DACSpawning loop-outline processor transforms an outlined Tapir loop to +/// evaluate the iterations using parallel recursive divide-and-conquer. +class DACSpawning : public LoopOutlineProcessor { +public: + DACSpawning(Module &M) : LoopOutlineProcessor(M) {} + void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, + ValueToValueMapTy &VMap) override final { + LoopOutlineProcessor::postProcessOutline(TL, Out, VMap); + implementDACIterSpawnOnHelper(TL, Out, VMap); + ++LoopsConvertedToDAC; + + // Move Cilksan instrumentation. + moveCilksanInstrumentation(TL, Out, VMap); + + // Add syncs to all exits of the outline. + addSyncToOutlineReturns(TL, Out, VMap); + } + +private: + void implementDACIterSpawnOnHelper( + TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap); +}; + +static bool isSRetInput(const Value *V, const Function &F) { + if (!isa(V)) + return false; + + auto ArgIter = F.arg_begin(); + if (F.hasParamAttribute(0, Attribute::StructRet) && V == &*ArgIter) + return true; + ++ArgIter; + if (F.hasParamAttribute(1, Attribute::StructRet) && V == &*ArgIter) + return true; + + return false; +} + +void LoopOutlineProcessor::setupLoopOutlineArgs( + Function &F, ValueSet &HelperArgs, SmallVectorImpl &HelperInputs, + ValueSet &InputSet, const SmallVectorImpl &LCArgs, + const SmallVectorImpl &LCInputs, const ValueSet &TLInputsFixed) { + // Add Tapir-loop inputs to vectors for args and helpers. + // + // First add the sret task input, if it exists. + ValueSet::iterator TLInputIter = TLInputsFixed.begin(); + if ((TLInputIter != TLInputsFixed.end()) && isSRetInput(*TLInputIter, F)) { + HelperArgs.insert(*TLInputIter); + HelperInputs.push_back(*TLInputIter); + ++TLInputIter; + } + + // Then add the loop control inputs. + for (Value *V : LCArgs) + HelperArgs.insert(V); + for (Value *V : LCInputs) { + HelperInputs.push_back(V); + // Add all loop-control inputs to the input set. + InputSet.insert(V); + } + + // Finally add the remaining inputs + while (TLInputIter != TLInputsFixed.end()) { + Value *V = *TLInputIter++; + assert(!HelperArgs.count(V)); + HelperArgs.insert(V); + HelperInputs.push_back(V); + } +} + +unsigned LoopOutlineProcessor::getIVArgIndex(const Function &F, + const ValueSet &Args) const { + // The argument for the primary induction variable is either the first or + // second input, depending on whether there is an sret input. 
+ unsigned IVArgOffset = 0; + if (isSRetInput(Args[IVArgOffset], F)) + ++IVArgOffset; + return IVArgOffset; +} + +void LoopOutlineProcessor::postProcessOutline(TapirLoopInfo &TL, + TaskOutlineInfo &Out, + ValueToValueMapTy &VMap) { + Function *Helper = Out.Outline; + // Use a fast calling convention for the helper. + Helper->setCallingConv(CallingConv::Fast); + // Note that the address of the helper is unimportant. + Helper->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + // The helper is internal to this module. + Helper->setLinkage(GlobalValue::InternalLinkage); +} + +void LoopOutlineProcessor::addSyncToOutlineReturns(TapirLoopInfo &TL, + TaskOutlineInfo &Out, + ValueToValueMapTy &VMap) { + Value *SyncRegion = + cast(VMap[TL.getTask()->getDetach()->getSyncRegion()]); + EscapeEnumerator EE(*Out.Outline, "ls.sync", false); + while (IRBuilder<> *AtExit = EE.Next()) { + // TODO: Add an option to insert syncs before resumes. + if (!isa(*AtExit->GetInsertPoint())) + continue; + + BasicBlock *Exit = AtExit->GetInsertBlock(); + BasicBlock *NewExit = SplitBlock(Exit, Exit->getTerminator()); + SyncInst *NewSync = SyncInst::Create(NewExit, SyncRegion); + ReplaceInstWithInst(Exit->getTerminator(), NewSync); + + // If the helper does not throw, there's no need to insert a sync.unwind. + if (Out.Outline->doesNotThrow()) + return; + + // Insert a call to sync.unwind. + CallInst *SyncUnwind = CallInst::Create( + Intrinsic::getDeclaration(&M, Intrinsic::sync_unwind), + { SyncRegion }, "", NewExit->getFirstNonPHIOrDbg()); + // If the Tapir loop has an unwind destination, change the sync.unwind to an + // invoke that unwinds to the cloned unwind destination. + if (TL.getUnwindDest()) + changeToInvokeAndSplitBasicBlock( + SyncUnwind, cast(VMap[TL.getUnwindDest()])); + } +} + +static void getDependenciesInSameBlock(Instruction *I, + SmallPtrSetImpl &Deps) { + const BasicBlock *Block = I->getParent(); + for (Value *Op : I->operand_values()) + if (Instruction *OpI = dyn_cast(Op)) + if (OpI->getParent() == Block) { + if (!Deps.insert(OpI).second) + continue; + getDependenciesInSameBlock(OpI, Deps); + } +} + +static void moveInstrumentation(StringRef Name, BasicBlock &From, + BasicBlock &To, + Instruction *InsertBefore = nullptr) { + assert((!InsertBefore || InsertBefore->getParent() == &To) && + "Insert point not in To block."); + BasicBlock::iterator InsertPoint = + InsertBefore ? InsertBefore->getIterator() : To.getFirstInsertionPt(); + + // Search the From block for instrumentation to move. + SmallPtrSet ToHoist; + for (Instruction &I : From) { + if (CallBase *CB = dyn_cast(&I)) + if (const Function *Called = CB->getCalledFunction()) + if (Called->getName() == Name) { + ToHoist.insert(&I); + getDependenciesInSameBlock(&I, ToHoist); + } + } + + // If we found no instrumentation to hoist, give up. + if (ToHoist.empty()) + return; + + // Hoist the instrumentation to InsertPoint in the To block. + for (BasicBlock::iterator II = From.begin(), IE = From.end(); II != IE;) { + Instruction *I = dyn_cast(II++); + if (!I || !ToHoist.count(I)) + continue; + + while (isa(II) && ToHoist.count(cast(II))) + ++II; + + To.splice(InsertPoint, &From, I->getIterator(), II); + } +} + +void LoopOutlineProcessor::moveCilksanInstrumentation(TapirLoopInfo &TL, + TaskOutlineInfo &Out, + ValueToValueMapTy &VMap) { + Task *T = TL.getTask(); + Loop *L = TL.getLoop(); + + // Get the header of the cloned loop. 
+ BasicBlock *Header = cast(VMap[L->getHeader()]); + assert(Header && "No cloned header block found"); + + // Get the task entry of the cloned loop. + BasicBlock *TaskEntry = cast(VMap[T->getEntry()]); + assert(TaskEntry && "No cloned task-entry block found"); + + // Get the latch of the cloned loop. + BasicBlock *Latch = cast(VMap[L->getLoopLatch()]); + assert(Latch && "No cloned loop latch found"); + + // Get the normal task exit of the cloned loop. + BasicBlock *TaskExit = Latch->getSinglePredecessor(); + + // Get the preheader of the cloned loop. + BasicBlock *Preheader = nullptr; + for (BasicBlock *Pred : predecessors(Header)) { + if (Latch == Pred) + continue; + Preheader = Pred; + break; + } + if (!Preheader) { + LLVM_DEBUG(dbgs() << "No preheader for hoisting Cilksan instrumentation\n"); + return; + } + + // Get the normal exit of the cloned loop. + BasicBlock *LatchExit = nullptr; + for (BasicBlock *Succ : successors(Latch)) { + if (Header == Succ) + continue; + LatchExit = Succ; + break; + } + if (!LatchExit) { + LLVM_DEBUG( + dbgs() << "No normal exit for hoisting Cilksan instrumentation\n"); + return; + } + + // Move __csan_detach and __csan_task to the Preheader. + moveInstrumentation("__csan_detach", *Header, *Preheader, + Preheader->getTerminator()); + moveInstrumentation("__csan_task", *TaskEntry, *Preheader, + Preheader->getTerminator()); + + // Move __csan_detach_continue and __csan_task_exit on the normal exit path to + // LatchExit. + moveInstrumentation("__csan_detach_continue", *Latch, *LatchExit); + if (TaskExit) + // There's only one block with __csan_task_exit instrumentation to move, so + // move it from that block. + moveInstrumentation("__csan_task_exit", *TaskExit, *LatchExit); + else { + // We need to create PHI nodes for the arguments of a new instrumentation + // call in LatchExit. + + // Scan all predecessors of Latch for __csan_task_exit instrumentation. + DenseMap Instrumentation; + Function *InstrFunc = nullptr; + for (BasicBlock *Pred : predecessors(Latch)) + for (Instruction &I : *Pred) + if (CallBase *CB = dyn_cast(&I)) + if (Function *Called = CB->getCalledFunction()) + if (Called->getName() == "__csan_task_exit") { + Instrumentation.insert(std::make_pair(Pred, CB)); + InstrFunc = Called; + } + + // Return early if we found no instrumentation. + if (!InstrFunc || Instrumentation.empty()) { + LLVM_DEBUG(dbgs() << "No task_exit instrumentation found"); + return; + } + + // Create PHI nodes at the start of Latch for the arguments of the moved + // instrumentation. + SmallVector InstrArgs; + for (BasicBlock *Pred : predecessors(Latch)) { + CallBase *Instr = Instrumentation[Pred]; + if (InstrArgs.empty()) { + // Create PHI nodes at the start of Latch for the instrumentation + // arguments. + IRBuilder<> IRB(&Latch->front()); + for (Value *Arg : Instr->args()) { + PHINode *ArgPHI = + IRB.CreatePHI(Arg->getType(), Instrumentation.size()); + ArgPHI->addIncoming(Arg, Pred); + InstrArgs.push_back(ArgPHI); + } + } else { + // Update the PHI nodes at the start of Latch for the instrumentation. + unsigned ArgIdx = 0; + for (Value *Arg : Instr->args()) { + cast(InstrArgs[ArgIdx])->addIncoming(Arg, Pred); + ++ArgIdx; + } + } + } + + // Insert new instrumentation call at the start of LatchExit. 
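+    // Its arguments are the PHI nodes built in Latch above, which select the
+    // instrumentation operands coming from whichever predecessor was taken.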
+ CallInst::Create(InstrFunc->getFunctionType(), InstrFunc, InstrArgs, "", + &*LatchExit->getFirstInsertionPt()); + + // Remove old instrumentation calls from predecessors + for (BasicBlock *Pred : predecessors(Latch)) + Instrumentation[Pred]->eraseFromParent(); + } +} + +namespace { +static void emitMissedWarning(const Loop *L, const TapirLoopHints &LH, + OptimizationRemarkEmitter *ORE) { + switch (LH.getStrategy()) { + case TapirLoopHints::ST_DAC: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "failed to use divide-and-conquer loop spawning." + << " Compile with -Rpass-analysis=" << LS_NAME + << " for more details."); + break; + case TapirLoopHints::ST_SEQ: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "SpawningDisabled", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "loop-spawning transformation disabled"); + break; + case TapirLoopHints::ST_END: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "unknown loop-spawning strategy"); + break; + } +} + +/// Process Tapir loops within the given function for loop spawning. +class LoopSpawningImpl { +public: + LoopSpawningImpl(Function &F, DominatorTree &DT, LoopInfo &LI, TaskInfo &TI, + ScalarEvolution &SE, AssumptionCache &AC, + TargetTransformInfo &TTI, TapirTarget *Target, + OptimizationRemarkEmitter &ORE) + : F(F), DT(DT), LI(LI), TI(TI), SE(SE), AC(AC), TTI(TTI), Target(Target), + ORE(ORE) {} + + ~LoopSpawningImpl() { + for (TapirLoopInfo *TL : TapirLoops) + delete TL; + TapirLoops.clear(); + TaskToTapirLoop.clear(); + LoopToTapirLoop.clear(); + } + + bool run(); + + // If loop \p L defines a recorded Tapir loop, returns the Tapir loop info for + // that Tapir loop. Otherwise returns null. + TapirLoopInfo *getTapirLoop(Loop *L) { + if (!LoopToTapirLoop.count(L)) + return nullptr; + return LoopToTapirLoop[L]; + } + + // If task \p T defines a recorded Tapir loop, returns the Tapir loop info for + // that Tapir loop. Otherwise returns null. + TapirLoopInfo *getTapirLoop(Task *T) { + if (!TaskToTapirLoop.count(T)) + return nullptr; + return TaskToTapirLoop[T]; + } + + // Gets the Tapir loop that contains basic block \p B, i.e., the Tapir loop + // for the loop associated with \p B. + TapirLoopInfo *getTapirLoop(const BasicBlock *B) { + return getTapirLoop(LI.getLoopFor(B)); + } + +private: + // Record a Tapir loop defined by loop \p L and task \p T. + TapirLoopInfo *createTapirLoop(Loop *L, Task *T) { + TapirLoops.push_back(new TapirLoopInfo(L, T, ORE)); + TaskToTapirLoop[T] = TapirLoops.back(); + LoopToTapirLoop[L] = TapirLoops.back(); + ++TapirLoopsFound; + return TapirLoops.back(); + } + + // Forget the specified Tapir loop \p TL. + void forgetTapirLoop(TapirLoopInfo *TL) { + TaskToTapirLoop.erase(TL->getTask()); + LoopToTapirLoop.erase(TL->getLoop()); + } + + // If loop \p L is a Tapir loop, return its corresponding task. + Task *getTaskIfTapirLoop(const Loop *L); + + // Get the LoopOutlineProcessor for handling Tapir loop \p TL. + LoopOutlineProcessor *getOutlineProcessor(TapirLoopInfo *TL); + + using LOPMapTy = DenseMap>; + + // For all recorded Tapir loops, determine the function arguments and inputs + // for the outlined helper functions for those loops. + // + // The \p LoopArgs map will store the function arguments for these outlined + // loop helpers. 
The \p LoopInputs map will store the corresponding arguments + // for calling those outlined helpers from the parent function. The \p + // LoopArgStarts map will store the instruction in the parent where new code + // for computing these outlined-helper-call arguments is first inserted. + void getAllTapirLoopInputs( + DenseMap &LoopInputSets, + DenseMap> &LoopCtlArgs, + DenseMap> &LoopCtlInputs); + + // Associate tasks with Tapir loops that enclose them. + void associateTasksToTapirLoops(); + + // Get the set of basic blocks within the task of Tapir loop \p TL. The \p + // TaskBlocks vector stores all of these basic blocks. The \p ReattachBlocks + // set identifies which blocks are terminated by a reattach instruction that + // terminates the task. The \p DetachedRethrowBlocks set identifies which + // blocks are terminated by detached-rethrow instructions that terminate the + // task. Entry points to shared exception-handling code is stored in the + // \p SharedEHEntries set. + // + // This method relies on being executed on the Tapir loops in a function in + // post order. + void getTapirLoopTaskBlocks( + TapirLoopInfo *TL, std::vector &TaskBlocks, + SmallPtrSetImpl &ReattachBlocks, + SmallPtrSetImpl &DetachedRethrowBlocks, + SmallPtrSetImpl &SharedEHEntries, + SmallPtrSetImpl &UnreachableExits); + + // Outline Tapir loop \p TL into a helper function. The \p Args set specified + // the arguments to that helper function. The map \p VMap will store the + // mapping of values in the original function to values in the outlined + // helper. + Function *createHelperForTapirLoop(TapirLoopInfo *TL, ValueSet &Args, + unsigned IVArgIndex, + unsigned LimitArgIndex, Module *DestM, + ValueToValueMapTy &VMap, + ValueToValueMapTy &InputMap); + + // Outline all recorded Tapir loops in the function. + TaskOutlineMapTy outlineAllTapirLoops(); + +private: + Function &F; + + DominatorTree &DT; + LoopInfo &LI; + TaskInfo &TI; + ScalarEvolution &SE; + AssumptionCache &AC; + TargetTransformInfo &TTI; + TapirTarget *Target; + OptimizationRemarkEmitter &ORE; + + std::vector TapirLoops; + DenseMap TaskToTapirLoop; + DenseMap LoopToTapirLoop; + LOPMapTy OutlineProcessors; +}; +} // end anonymous namespace + +// Set up a basic unwind for a detached task: +// +// callunwind: +// lpad = landingpad +// catch null +// invoke detached_rethrow(lpad), label unreachable, label detach_unwind +static BasicBlock *createTaskUnwind(Function *F, BasicBlock *UnwindDest, + Value *SyncRegion, const Twine &Name = "") { + Module *M = F->getParent(); + LLVMContext &Ctx = M->getContext(); + BasicBlock *CallUnwind = BasicBlock::Create(Ctx, Name, F); + + // Create the landing bad. + IRBuilder<> Builder(CallUnwind); + LandingPadInst *LPad = Builder.CreateLandingPad( + UnwindDest->getLandingPadInst()->getType(), 0); + LPad->setCleanup(true); + // Create the normal return for the detached rethrow. + BasicBlock *DRUnreachable = BasicBlock::Create( + Ctx, CallUnwind->getName()+".unreachable", F); + // Invoke the detached rethrow. + Builder.CreateInvoke( + Intrinsic::getDeclaration(M, Intrinsic::detached_rethrow, + { LPad->getType() }), + DRUnreachable, UnwindDest, { SyncRegion, LPad }); + + // Terminate the normal return of the detached rethrow with unreachable. + Builder.SetInsertPoint(DRUnreachable); + Builder.CreateUnreachable(); + + return CallUnwind; +} + +/// Implement the parallel loop control for a given outlined Tapir loop to +/// process loop iterations in a parallel recursive divide-and-conquer fashion. 
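For intuition, the control skeleton built by the routine below corresponds roughly to the following source-level recursion. This is a sketch only: the real argument list, names, and IR are generated by the pass, and compiling it requires OpenCilk's `-fopencilk`.

```cpp
#include <cilk/cilk.h>
#include <cstdint>

// Rough source-level analogue of the outlined helper after the
// divide-and-conquer transformation (illustrative, not literal pass output).
void helper(double *a, uint64_t start, uint64_t end, uint64_t grainsize) {
  // DACHead: keep splitting while more than a grainsize of work remains.
  while (end - start > grainsize) {
    uint64_t mid = start + (end - start) / 2;
    // RecurHead/RecurDet: spawn the lower half recursively...
    cilk_spawn helper(a, start, mid, grainsize);
    // RecurCont: ...and continue with the upper half (mid, or mid + 1 for an
    // inclusive iteration range).
    start = mid;
  }
  // The cloned loop body then runs the remaining iterations serially.
  for (uint64_t i = start; i < end; ++i)
    a[i] += 1.0;
}  // The implicit cilk_sync at function exit waits for the spawned halves.
```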
+void DACSpawning::implementDACIterSpawnOnHelper( + TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) { + NamedRegionTimer NRT("implementDACIterSpawnOnHelper", + "Implement D&C spawning of loop iterations", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + Task *T = TL.getTask(); + Loop *L = TL.getLoop(); + + DebugLoc TLDebugLoc = cast(VMap[T->getDetach()])->getDebugLoc(); + Value *SyncRegion = cast(VMap[T->getDetach()->getSyncRegion()]); + Function *Helper = Out.Outline; + BasicBlock *Preheader = cast(VMap[L->getLoopPreheader()]); + + PHINode *PrimaryIV = cast(VMap[TL.getPrimaryInduction().first]); + + // Remove the norecurse attribute from Helper. + if (Helper->doesNotRecurse()) + Helper->removeFnAttr(Attribute::NoRecurse); + + // Convert the cloned loop into the strip-mined loop body. + assert(Preheader->getParent() == Helper && + "Preheader does not belong to helper function."); + assert(PrimaryIV->getParent()->getParent() == Helper && + "PrimaryIV does not belong to header"); + + // Get end and grainsize arguments + Argument *End, *Grainsize; + { + auto OutlineArgsIter = Helper->arg_begin(); + if (Helper->hasParamAttribute(0, Attribute::StructRet)) + ++OutlineArgsIter; + // End argument is second LC input. + End = &*++OutlineArgsIter; + // Grainsize argument is third LC input. + Grainsize = &*++OutlineArgsIter; + } + + BasicBlock *DACHead = Preheader; + if (&(Helper->getEntryBlock()) == Preheader) { + // Split the entry block. We'll want to create a backedge into + // the split block later. + DACHead = SplitBlock(Preheader, &Preheader->front()); + + // Move any syncregion_start's in DACHead into Preheader. + BasicBlock::iterator InsertPoint = Preheader->begin(); + for (BasicBlock::iterator I = DACHead->begin(), E = DACHead->end(); + I != E;) { + IntrinsicInst *II = dyn_cast(I++); + if (!II) + continue; + if (Intrinsic::syncregion_start != II->getIntrinsicID()) + continue; + + while (isa(I) && + Intrinsic::syncregion_start == + cast(I)->getIntrinsicID()) + ++I; + + Preheader->splice(InsertPoint, &*DACHead, II->getIterator(), I); + } + + if (!Preheader->getTerminator()->getDebugLoc()) + Preheader->getTerminator()->setDebugLoc( + DACHead->getTerminator()->getDebugLoc()); + } + + Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(DACHead); + Value *PrimaryIVInc = PrimaryIV->getIncomingValueForBlock( + cast(VMap[L->getLoopLatch()])); + + // At this point, DACHead is the preheader to the loop and is guaranteed to + // not be the function entry: + // + // DACHead: ; preds = %entry + // br label Header + // + // From this block, we first create the skeleton of the parallel D&C loop + // control: + // + // DACHead: + // PrimaryIVStart = phi ??? + // IterCount = sub End, PrimaryIVStart + // IterCountCmp = icmp ugt IterCount, Grainsize + // br i1 IterCountCmp, label RecurHead, label Header + // + // RecurHead: + // br label RecurDet + // + // RecurDet: + // br label RecurCont + // + // RecurCont: + // br label DACHead + BasicBlock *RecurHead, *RecurDet, *RecurCont; + Value *IterCount; + PHINode *PrimaryIVStart; + Value *Start; + { + Instruction *PreheaderOrigFront = &(DACHead->front()); + IRBuilder<> Builder(PreheaderOrigFront); + if (!Builder.getCurrentDebugLocation()) + Builder.SetCurrentDebugLocation( + Preheader->getTerminator()->getDebugLoc()); + // Create branch based on grainsize. 
+ PrimaryIVStart = Builder.CreatePHI(PrimaryIV->getType(), 2, + PrimaryIV->getName()+".dac"); + PrimaryIVStart->setDebugLoc(PrimaryIV->getDebugLoc()); + PrimaryIVInput->replaceAllUsesWith(PrimaryIVStart); + Start = PrimaryIVStart; + // Extend or truncate start, if necessary. + if (PrimaryIVStart->getType() != End->getType()) + Start = Builder.CreateZExtOrTrunc(PrimaryIVStart, End->getType()); + IterCount = Builder.CreateSub(End, Start, "itercount"); + Value *IterCountCmp = Builder.CreateICmpUGT(IterCount, Grainsize); + Instruction *RecurTerm = + SplitBlockAndInsertIfThen(IterCountCmp, PreheaderOrigFront, + /*Unreachable=*/false, + /*BranchWeights=*/nullptr); + RecurHead = RecurTerm->getParent(); + // Create RecurHead, RecurDet, and RecurCont, with appropriate branches. + RecurDet = SplitBlock(RecurHead, RecurHead->getTerminator()); + RecurCont = SplitBlock(RecurDet, RecurDet->getTerminator()); + RecurCont->getTerminator()->replaceUsesOfWith(RecurTerm->getSuccessor(0), + DACHead); + } + + // Compute the mid iteration in RecurHead: + // + // RecurHead: + // %halfcount = lshr IterCount, 1 + // MidIter = add PrimaryIVStart, %halfcount + // br label RecurDet + Instruction *MidIter; + { + IRBuilder<> Builder(&(RecurHead->front())); + Value *HalfCount = Builder.CreateLShr(IterCount, 1, "halfcount"); + MidIter = cast(Builder.CreateAdd(Start, HalfCount, "miditer")); + // Copy flags from the increment operation on the primary IV. + MidIter->copyIRFlags(PrimaryIVInc); + } + + // Create a recursive call in RecurDet. If the call cannot throw, then + // RecurDet becomes: + // + // RecurDet: + // call Helper(..., PrimaryIVStart, MidIter, ...) + // br label RecurCont + // + // Otherwise an a new unwind destination, CallUnwind, is created or the + // invoke, and RecurDet becomes: + // + // RecurDet: + // invoke Helper(..., PrimaryIVStart, MidIter, ...) + // to label CallDest unwind label CallUnwind + // + // CallDest: + // br label RecurCont + BasicBlock *RecurCallDest = RecurDet; + BasicBlock *UnwindDest = nullptr; + if (TL.getUnwindDest()) + UnwindDest = cast(VMap[TL.getUnwindDest()]); + { + // Create input array for recursive call. + IRBuilder<> Builder(&(RecurDet->front())); + SmallVector RecurCallInputs; + for (Value &V : Helper->args()) { + // Only the inputs for the start and end iterations need special care. + // All other inputs should match the arguments of Helper. + if (&V == PrimaryIVInput) + RecurCallInputs.push_back(PrimaryIVStart); + else if (&V == End) + RecurCallInputs.push_back(MidIter); + else + RecurCallInputs.push_back(&V); + } + + if (!UnwindDest) { + // Common case. Insert a call to the outline immediately before the detach. + CallInst *RecurCall; + // Create call instruction. + RecurCall = Builder.CreateCall(Helper, RecurCallInputs); + // Use a fast calling convention for the outline. + RecurCall->setCallingConv(Helper->getCallingConv()); + RecurCall->setDebugLoc(TLDebugLoc); + if (Helper->doesNotThrow()) + RecurCall->setDoesNotThrow(); + } else { + InvokeInst *RecurCall; + BasicBlock *CallDest = SplitBlock(RecurDet, RecurDet->getTerminator()); + BasicBlock *CallUnwind = + createTaskUnwind(Helper, UnwindDest, SyncRegion, + RecurDet->getName()+".unwind"); + RecurCall = InvokeInst::Create(Helper, CallDest, CallUnwind, + RecurCallInputs); + // Use a fast calling convention for the outline. 
+ RecurCall->setCallingConv(Helper->getCallingConv()); + RecurCall->setDebugLoc(TLDebugLoc); + ReplaceInstWithInst(RecurDet->getTerminator(), RecurCall); + RecurCallDest = CallDest; + } + } + + // Set up continuation of detached recursive call to compute the next loop + // iteration to execute. For inclusive ranges, this means adding one to + // MidIter: + // + // RecurCont: + // MidIterPlusOne = add MidIter, 1 + // br label DACHead + Instruction *NextIter = MidIter; + if (TL.isInclusiveRange()) { + IRBuilder<> Builder(&(RecurCont->front())); + NextIter = cast( + Builder.CreateAdd(MidIter, ConstantInt::get(End->getType(), 1), + "miditerplusone")); + // Copy flags from the increment operation on the primary IV. + NextIter->copyIRFlags(PrimaryIVInc); + // Extend or truncate NextIter, if necessary + if (PrimaryIVStart->getType() != NextIter->getType()) + NextIter = cast( + Builder.CreateZExtOrTrunc(NextIter, PrimaryIVStart->getType())); + } else if (PrimaryIVStart->getType() != NextIter->getType()) { + IRBuilder<> Builder(&(RecurCont->front())); + NextIter = cast( + Builder.CreateZExtOrTrunc(NextIter, PrimaryIVStart->getType())); + } + + // Finish the phi node in DACHead. + // + // DACHead: + // PrimaryIVStart = phi [ PrimaryIVInput, %entry ], [ NextIter, RecurCont ] + // ... + PrimaryIVStart->addIncoming(PrimaryIVInput, Preheader); + PrimaryIVStart->addIncoming(NextIter, RecurCont); + + // Make the recursive DAC call parallel. + // + // RecurHead: + // detach within SyncRegion, label RecurDet, label RecurCont + // (unwind label DetachUnwind) + // + // RecurDet: + // call Helper(...) + // reattach label RecurCont + // + // or + // + // RecurDet: + // invoke Helper(...) to CallDest unwind UnwindDest + // + // CallDest: + // reattach label RecurCont + { + IRBuilder<> Builder(RecurHead->getTerminator()); + // Create the detach. + DetachInst *NewDI; + if (!UnwindDest) + NewDI = Builder.CreateDetach(RecurDet, RecurCont, SyncRegion); + else + NewDI = Builder.CreateDetach(RecurDet, RecurCont, UnwindDest, + SyncRegion); + NewDI->setDebugLoc(TLDebugLoc); + RecurHead->getTerminator()->eraseFromParent(); + + // Create the reattach. + Builder.SetInsertPoint(RecurCallDest->getTerminator()); + ReattachInst *RI = Builder.CreateReattach(RecurCont, SyncRegion); + RI->setDebugLoc(TLDebugLoc); + RecurCallDest->getTerminator()->eraseFromParent(); + } +} + +/// Examine a given loop to determine if its a Tapir loop that can and should be +/// processed. Returns the Task that encodes the loop body if so, or nullptr if +/// not. +Task *LoopSpawningImpl::getTaskIfTapirLoop(const Loop *L) { + NamedRegionTimer NRT("getTaskIfTapirLoop", + "Check if loop is a Tapir loop to process", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + + LLVM_DEBUG(dbgs() << "Analyzing for spawning: " << *L); + + TapirLoopHints Hints(L); + + // Loop must have a preheader. LoopSimplify should guarantee that the loop + // preheader is not terminated by a sync. 
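For context, here is a hypothetical loop that fails these structural checks: its exit condition is data dependent, so there is no up-front trip count and no canonical Tapir-loop shape, and loop spawning leaves it alone. When spawning was explicitly requested for such a loop, the remark machinery above points the user at `-Rpass-analysis=` with the pass name (LS_NAME, presumably `loop-spawning`) for details.

```cpp
#include <cilk/cilk.h>
#include <cstdint>

void body(double *a, uint64_t i);

// Hypothetical example: a spawning loop with a data-dependent exit.  Its
// header is terminated by a conditional branch rather than a detach, so it
// does not match the canonical Tapir-loop structure; each cilk_spawn is
// simply lowered as an individual task.
void not_a_tapir_loop(double *a, uint64_t n, bool (*keep_going)(uint64_t)) {
  uint64_t i = 0;
  while (i < n && keep_going(i)) {
    cilk_spawn body(a, i);
    ++i;
  }
  cilk_sync;
}
```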
+ const BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + LLVM_DEBUG(dbgs() << "Loop lacks a preheader.\n"); + if (hintsDemandOutlining(Hints)) { + ORE.emit(TapirLoopInfo::createMissedAnalysis(LS_NAME, "NoPreheader", L) + << "loop lacks a preheader"); + emitMissedWarning(L, Hints, &ORE); + } + return nullptr; + } else if (!isa(Preheader->getTerminator())) { + LLVM_DEBUG(dbgs() << "Loop preheader is not terminated by a branch.\n"); + if (hintsDemandOutlining(Hints)) { + ORE.emit(TapirLoopInfo::createMissedAnalysis(LS_NAME, "ComplexPreheader", + L) + << "loop preheader not terminated by a branch"); + emitMissedWarning(L, Hints, &ORE); + } + return nullptr; + } + + // Get the task for this loop if it is a Tapir loop. + Task *T = llvm::getTaskIfTapirLoop(L, &TI); + if (!T) { + LLVM_DEBUG(dbgs() << "Loop does not match structure of Tapir loop.\n"); + if (hintsDemandOutlining(Hints)) { + ORE.emit(TapirLoopInfo::createMissedAnalysis(LS_NAME, "NonCanonicalLoop", + L) + << "loop does not have the canonical structure of a Tapir loop"); + emitMissedWarning(L, Hints, &ORE); + } + return nullptr; + } + + return T; +} + +/// Get the LoopOutlineProcessor for handling Tapir loop \p TL. +LoopOutlineProcessor *LoopSpawningImpl::getOutlineProcessor(TapirLoopInfo *TL) { + NamedRegionTimer NRT("getOutlineProcessor", + "Get a loop-outline processor for a Tapir loop", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + + // Allow the Tapir target to define a custom loop-outline processor. + if (LoopOutlineProcessor *TargetLOP = Target->getLoopOutlineProcessor(TL)) + return TargetLOP; + + Module &M = *F.getParent(); + Loop *L = TL->getLoop(); + TapirLoopHints Hints(L); + + switch (Hints.getStrategy()) { + case TapirLoopHints::ST_DAC: return new DACSpawning(M); + default: return new DefaultLoopOutlineProcessor(M); + } +} + +/// Associate tasks with Tapir loops that enclose them. +void LoopSpawningImpl::associateTasksToTapirLoops() { + NamedRegionTimer NRT("associateTasksToTapirLoops", + "Associate tasks to Tapir loops", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + + SmallVector UnassocTasks; + // Traverse the tasks in post order, queueing up tasks that are not roots of + // Tapir loops. + for (Task *T : post_order(TI.getRootTask())) { + TapirLoopInfo *TL = getTapirLoop(T); + if (!TL) { + UnassocTasks.push_back(T); + continue; + } + + // When we find a Task T at the root of a Tapir loop TL, associate + // previously traversed tasks that are enclosed in T with TL. + while (!UnassocTasks.empty()) { + Task *UT = UnassocTasks.back(); + if (!TI.encloses(T, UT)) + break; + TL->addDescendantTask(UT); + UnassocTasks.pop_back(); + } + } +} + +// Helper test to see if the given basic block is the placeholder normal +// destination of a detached.rethrow or taskframe.resume intrinsic. +static bool isUnreachablePlaceholder(const BasicBlock *B) { + for (const BasicBlock *Pred : predecessors(B)) { + if (!isDetachedRethrow(Pred->getTerminator()) && + !isTaskFrameResume(Pred->getTerminator())) + return false; + if (B != cast(Pred->getTerminator())->getNormalDest()) + return false; + } + return true; +} + +/// Get the set of basic blocks within the task of Tapir loop \p TL. The \p +/// TaskBlocks vector stores all of these basic blocks. The \p ReattachBlocks +/// set identifies which blocks are terminated by a reattach instruction that +/// terminates the task. 
The \p DetachedRethrowBlocks set identifies which +/// blocks are terminated by detached-rethrow instructions that terminate the +/// task. Entry points to shared exception-handling code is stored in the +/// \p SharedEHEntries set. +/// +/// This method relies on being executed on the Tapir loops in a function in +/// post order. +void LoopSpawningImpl::getTapirLoopTaskBlocks( + TapirLoopInfo *TL, std::vector &TaskBlocks, + SmallPtrSetImpl &ReattachBlocks, + SmallPtrSetImpl &DetachedRethrowBlocks, + SmallPtrSetImpl &SharedEHEntries, + SmallPtrSetImpl &UnreachableExits) { + NamedRegionTimer NRT("getTapirLoopTaskBlocks", + "Get basic blocks for Tapir loop", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + + Task *T = TL->getTask(); + SmallVector EnclosedTasks; + TL->getEnclosedTasks(EnclosedTasks); + SmallPtrSet VisitedSharedEH; + + // Get the header and loop-latch blocks of all Tapir subloops. + SmallPtrSet SubloopControlToExclude; + for (Task *EncT : EnclosedTasks) { + for (Task *SubT : EncT->subtasks()) { + if (TapirLoopInfo *SubTL = getTapirLoop(SubT)) { + SubloopControlToExclude.insert(SubTL->getLoop()->getHeader()); + SubloopControlToExclude.insert(SubTL->getLoop()->getLoopLatch()); + // Mark the unwind destination of this subloop's detach as a + // "SharedEHEntry," meaning it needs its Phi nodes updated after + // cloning. + DetachInst *SubDI = + cast(SubTL->getLoop()->getHeader()->getTerminator()); + if (SubDI->hasUnwindDest()) + SharedEHEntries.insert(SubDI->getUnwindDest()); + } + } + } + + for (Task *EncT : EnclosedTasks) { + for (Spindle *S : depth_first>(EncT->getEntrySpindle())) { + // Record the entry blocks of any shared-EH spindles. + if (S->isSharedEH()) { + SharedEHEntries.insert(S->getEntry()); + if (!VisitedSharedEH.insert(S).second) + continue; + } + + bool TopLevelTaskSpindle = T->contains(S) || T->isSharedEHExit(S); + for (BasicBlock *B : S->blocks()) { + // Don't clone header and loop-latch blocks for Tapir subloops. + if (SubloopControlToExclude.count(B)) + continue; + + // Skip basic blocks that are successors of detached rethrows in T. + // They're dead anyway. + if (TopLevelTaskSpindle && isSuccessorOfDetachedRethrow(B)) + continue; + + // Skip unreachable placeholder blocks, namely, the normal destinations + // of detached.rethrow and taskframe.resume instructions. + if (isUnreachablePlaceholder(B)) + continue; + + LLVM_DEBUG(dbgs() << "Adding block " << B->getName() << "\n"); + TaskBlocks.push_back(B); + + if (TopLevelTaskSpindle) { + // Record the blocks terminated by reattaches and detached rethrows. + if (isa(B->getTerminator())) + ReattachBlocks.insert(B); + if (isDetachedRethrow(B->getTerminator())) + DetachedRethrowBlocks.insert(B); + if (isTaskFrameResume(B->getTerminator())) + UnreachableExits.insert( + cast(B->getTerminator())->getNormalDest()); + } else if (isDetachedRethrow(B->getTerminator()) || + isTaskFrameResume(B->getTerminator())) { + UnreachableExits.insert( + cast(B->getTerminator())->getNormalDest()); + } + } + } + } +} + +/// Compute the grainsize of the loop, based on the limit. Currently this +/// routine injects a call to the tapir_loop_grainsize intrinsic, which is +/// handled in a target-specific way by subsequent lowering passes. 
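For intuition, when no explicit grainsize is given, the intrinsic emitted by the routine below is typically lowered by the Cilk target to something in the spirit of the classic Cilk grainsize heuristic. The sketch is only an approximation; the actual formula and constants are target specific and are assumptions here.

```cpp
#include <algorithm>
#include <cstdint>

// Illustrative stand-in for a target lowering of
// tapir_loop_grainsize(tripCount): cap the per-spawn chunk at a constant and
// otherwise split the iterations into roughly eight chunks per worker.  The
// constants (2048 and 8) are assumptions, not a statement about the Cilk ABI.
uint64_t approx_default_grainsize(uint64_t tripCount, uint64_t numWorkers) {
  uint64_t perChunk = (tripCount + 8 * numWorkers - 1) / (8 * numWorkers);
  return std::max<uint64_t>(1, std::min<uint64_t>(2048, perChunk));
}
```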
+static Value *computeGrainsize(TapirLoopInfo *TL) { + Value *TripCount = TL->getTripCount(); + assert(TripCount && + "No trip count found for computing grainsize of Tapir loop."); + Type *IdxTy = TripCount->getType(); + BasicBlock *Preheader = TL->getLoop()->getLoopPreheader(); + Module *M = Preheader->getModule(); + IRBuilder<> B(Preheader->getTerminator()); + B.SetCurrentDebugLocation(TL->getDebugLoc()); + return B.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::tapir_loop_grainsize, + { IdxTy }), { TripCount }); +} + +/// Get the grainsize of this loop either from metadata or by computing the +/// grainsize. +static Value *getGrainsizeVal(TapirLoopInfo *TL) { + Value *GrainVal; + if (unsigned Grainsize = TL->getGrainsize()) + GrainVal = ConstantInt::get(TL->getTripCount()->getType(), Grainsize); + else + GrainVal = computeGrainsize(TL); + + LLVM_DEBUG(dbgs() << "Grainsize value: " << *GrainVal << "\n"); + return GrainVal; +} + +/// Determine the inputs to Tapir loop \p TL for the loop control. +static void getLoopControlInputs(TapirLoopInfo *TL, + SmallVectorImpl &LCArgs, + SmallVectorImpl &LCInputs) { + // Add an argument for the primary induction variable. + auto &PrimaryInduction = TL->getPrimaryInduction(); + PHINode *PrimaryPhi = PrimaryInduction.first; + TL->StartIterArg = new Argument(PrimaryPhi->getType(), + PrimaryPhi->getName() + ".start"); + LCArgs.push_back(TL->StartIterArg); + LCInputs.push_back(PrimaryInduction.second.getStartValue()); + + // Add an argument for the trip count. + Value *TripCount = TL->getTripCount(); + assert(TripCount && "No trip count found for Tapir loop end argument."); + TL->EndIterArg = new Argument(TripCount->getType(), "end"); + LCArgs.push_back(TL->EndIterArg); + LCInputs.push_back(TripCount); + + // Add an argument for the grainsize. + Value *GrainsizeVal = getGrainsizeVal(TL); + TL->GrainsizeArg = new Argument(GrainsizeVal->getType(), "grainsize"); + LCArgs.push_back(TL->GrainsizeArg); + LCInputs.push_back(GrainsizeVal); + + assert(TL->getInductionVars()->size() == 1 && + "Induction vars to process for arguments."); + // // Add arguments for the other IV's. + // for (auto &InductionEntry : *TL->getInductionVars()) { + // PHINode *Phi = InductionEntry.first; + // InductionDescriptor II = InductionEntry.second; + // if (Phi == PrimaryInduction.first) continue; + // LCArgs.push_back(new Argument(Phi->getType(), + // Phi->getName() + ".start")); + // LCInputs.push_back(II.getStartValue()); + // } +} + +/// For all recorded Tapir loops, determine the function arguments and inputs +/// for the outlined helper functions for those loops. +/// +/// The \p LoopArgs map will store the function arguments for these outlined +/// loop helpers. The \p LoopInputs map will store the corresponding arguments +/// for calling those outlined helpers from the parent function. The \p +/// LoopArgStarts map will store the instruction in the parent where new code +/// for computing these outlined-helper-call arguments is first inserted. +void LoopSpawningImpl::getAllTapirLoopInputs( + DenseMap &LoopInputSets, + DenseMap> &LoopCtlArgs, + DenseMap> &LoopCtlInputs) { + NamedRegionTimer NRT("getAllTapirLoopInputs", + "Determine inputs for all Tapir loops", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + + // Determine the inputs for all tasks. + TaskValueSetMap TaskInputs = findAllTaskInputs(F, DT, TI); + + // Combine these sets of inputs to determine inputs for each Tapir loop. 
+ DenseMap TapirLoopInputs; + for (Task *T : post_order(TI.getRootTask())) { + if (TapirLoopInfo *TL = getTapirLoop(T)) { + Loop *L = TL->getLoop(); + + // Convert inputs for task T to Tapir-loop inputs. + ValueSet TLInputs = getTapirLoopInputs(TL, TaskInputs[T]); + LoopInputSets[L] = TLInputs; + LLVM_DEBUG({ + dbgs() << "TLInputs\n"; + for (Value *V : TLInputs) + dbgs() << "\t" << *V << "\n"; + }); + + // Determine loop-control inputs. + getLoopControlInputs(TL, LoopCtlArgs[L], LoopCtlInputs[L]); + + LLVM_DEBUG({ + dbgs() << "LoopCtlArgs:\n"; + for (Value *V : LoopCtlArgs[L]) + dbgs() << "\t" << *V << "\n"; + dbgs() << "LoopCtlInputs:\n"; + for (Value *V : LoopCtlInputs[L]) + dbgs() << "\t" << *V << "\n"; + }); + } + } +} + +static void updateClonedIVs( + TapirLoopInfo *TL, BasicBlock *OrigPreheader, + ValueSet &Args, ValueToValueMapTy &VMap, unsigned IVArgIndex, + unsigned NextIVArgOffset = 3) { + NamedRegionTimer NRT("updateClonedIVs", "Updated IVs in Tapir-loop helper", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + + auto &PrimaryInduction = TL->getPrimaryInduction(); + PHINode *PrimaryPhi = PrimaryInduction.first; + + Value *PrimaryArg = Args[IVArgIndex]; + + // TODO: This assertion implies that the following loop should only run once, + // for the primary induction variable. However, the loop is provided in case + // we decide to handle more complicated sets of induction variables in the + // future. + assert(TL->getInductionVars()->size() == 1 && + "updateClonedIVs to process multiple inductions."); + + // Get the next argument that provides an input to an IV, which is typically 3 + // after the input for the primary induction variable, after the end-teration + // and grainsize arguments. + unsigned ArgIdx = IVArgIndex + NextIVArgOffset; + for (auto &InductionEntry : *TL->getInductionVars()) { + PHINode *OrigPhi = InductionEntry.first; + InductionDescriptor II = InductionEntry.second; + assert(II.getKind() == InductionDescriptor::IK_IntInduction && + "Non-integer induction found."); + assert((II.getConstIntStepValue()->isOne() || + II.getConstIntStepValue()->isMinusOne()) && + "Non-canonical induction found: non-unit step."); + assert(isa(II.getStartValue()) && + "Non-canonical induction found: non-constant start."); + assert(cast(II.getStartValue())->isNullValue() && + "Non-canonical induction found: non-zero start."); + + // Get the remapped PHI node and preheader + PHINode *NewPhi = cast(VMap[OrigPhi]); + BasicBlock *NewPreheader = cast(VMap[OrigPreheader]); + + // Replace the input for the remapped PHI node from the preheader with the + // input argument. + unsigned BBIdx = NewPhi->getBasicBlockIndex(NewPreheader); + if (OrigPhi == PrimaryPhi) + NewPhi->setIncomingValue(BBIdx, VMap[PrimaryArg]); + else + // TODO: Because of the assertion above, this line should never run. + NewPhi->setIncomingValue(BBIdx, VMap[Args[ArgIdx++]]); + } +} + +namespace { +// ValueMaterializer to manage remapping uses of the tripcount in the helper +// function for the loop, when the only uses of tripcount occur in the condition +// for the loop backedge and, possibly, in metadata. 
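The class below refines Tapir's `OutlineMaterializer`, which presumably builds on LLVM's generic `ValueMaterializer` hook: during value mapping, an operand with no `VMap` entry is first offered to the materializer, and returning `nullptr` falls back to the default mapping rules. A minimal, generic sketch of that hook, not specific to Tapir:

```cpp
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Generic sketch: substitute To wherever From is used and has no explicit
// entry in the value map.
struct ReplaceOneValue final : public ValueMaterializer {
  Value *From;
  Value *To;
  ReplaceOneValue(Value *From, Value *To) : From(From), To(To) {}
  Value *materialize(Value *V) override { return V == From ? To : nullptr; }
};

static void remapWithSubstitute(Instruction &I, ValueToValueMapTy &VMap,
                                Value *From, Value *To) {
  ReplaceOneValue Mat(From, To);
  RemapInstruction(&I, VMap, RF_IgnoreMissingLocals, /*TypeMapper=*/nullptr,
                   &Mat);
}
```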
+class ArgEndMaterializer final : public OutlineMaterializer { +private: + Value *TripCount; + Value *ArgEnd; +public: + ArgEndMaterializer(const Instruction *SrcSyncRegion, Value *TripCount, + Value *ArgEnd) + : OutlineMaterializer(SrcSyncRegion), TripCount(TripCount), + ArgEnd(ArgEnd) {} + + Value *materialize(Value *V) final { + // If we're materializing metadata for TripCount, materialize empty metadata + // instead. + if (auto *MDV = dyn_cast(V)) { + Metadata *MD = MDV->getMetadata(); + if (auto *LAM = dyn_cast(MD)) + if (LAM->getValue() == TripCount) + return MetadataAsValue::get( + V->getContext(), MDTuple::get(V->getContext(), std::nullopt)); + } + + // Materialize TripCount with ArgEnd. This should only occur in the loop + // latch, and we'll overwrite the use of ArgEnd later. + if (V == TripCount) + return ArgEnd; + + // Otherwise go with the default behavior. + return OutlineMaterializer::materialize(V); + } +}; +} + +/// Outline Tapir loop \p TL into a helper function. The \p Args set specified +/// the arguments to that helper function. The map \p VMap will store the +/// mapping of values in the original function to values in the outlined helper. +Function *LoopSpawningImpl::createHelperForTapirLoop( + TapirLoopInfo *TL, ValueSet &Args, unsigned IVArgIndex, + unsigned LimitArgIndex, Module *DestM, ValueToValueMapTy &VMap, + ValueToValueMapTy &InputMap) { + Task *T = TL->getTask(); + Loop *L = TL->getLoop(); + BasicBlock *Header = L->getHeader(); + BasicBlock *Preheader = L->getLoopPreheader(); + + // Collect all basic blocks in the Tapir loop. + std::vector TLBlocks; + TLBlocks.push_back(L->getHeader()); + // Entry blocks of shared-EH spindles may contain PHI nodes that need to be + // rewritten in the cloned helper. + SmallPtrSet SharedEHEntries; + SmallPtrSet DetachedRethrowBlocks; + SmallPtrSet UnreachableExits; + // Reattach instructions and detached rethrows in this task might need special + // handling. + SmallPtrSet ReattachBlocks; + getTapirLoopTaskBlocks(TL, TLBlocks, ReattachBlocks, DetachedRethrowBlocks, + SharedEHEntries, UnreachableExits); + TLBlocks.push_back(L->getLoopLatch()); + + DetachInst *DI = T->getDetach(); + const Instruction *InputSyncRegion = + dyn_cast(DI->getSyncRegion()); + + OutlineMaterializer *Mat = nullptr; + if (!isa(TL->getTripCount()) && !Args.count(TL->getTripCount())) + // Create an ArgEndMaterializer to handle uses of TL->getTripCount(). + Mat = new ArgEndMaterializer(InputSyncRegion, TL->getTripCount(), + Args[LimitArgIndex]); + else + Mat = new OutlineMaterializer(InputSyncRegion); + + Twine NameSuffix = ".ls" + Twine(TL->getLoop()->getLoopDepth()); + SmallVector Returns; // Ignore returns cloned. + ValueSet Outputs; // Outputs must be empty. + Function *Helper; + { + NamedRegionTimer NRT("CreateHelper", "Create helper for Tapir loop", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + Helper = CreateHelper( + Args, Outputs, TLBlocks, Header, Preheader, TL->getExitBlock(), VMap, + DestM, F.getSubprogram() != nullptr, Returns, NameSuffix.str(), nullptr, + &DetachedRethrowBlocks, &SharedEHEntries, TL->getUnwindDest(), + &UnreachableExits, nullptr, nullptr, nullptr, Mat); + } // end timed region + + assert(Returns.empty() && "Returns cloned when cloning detached CFG."); + // If the Tapir loop has no unwind destination, then the outlined function + // cannot throw. + if (F.doesNotThrow() && !TL->getUnwindDest()) + Helper->setDoesNotThrow(); + // Don't inherit the noreturn attribute from the caller. 
+ if (F.doesNotReturn()) + Helper->removeFnAttr(Attribute::NoReturn); + + // Update cloned loop condition to use the end-iteration argument. + unsigned TripCountIdx = 0; + Value *TripCount = TL->getTripCount(); + if (InputMap[TripCount]) + TripCount = InputMap[TripCount]; + if (TL->getCondition()->getOperand(0) != TripCount) + ++TripCountIdx; + assert(TL->getCondition()->getOperand(TripCountIdx) == TripCount && + "Trip count not used in condition"); + ICmpInst *ClonedCond = cast(VMap[TL->getCondition()]); + ClonedCond->setOperand(TripCountIdx, VMap[Args[LimitArgIndex]]); + + // If the trip count is variable and we're not passing the trip count as an + // argument, undo the eariler temporarily mapping. + if (!isa(TL->getTripCount()) && !Args.count(TL->getTripCount())) { + VMap.erase(TL->getTripCount()); + } + + // Delete the ArgEndMaterializer or OutlineMaterializer. + if (Mat) + delete Mat; + + // Rewrite cloned IV's to start at their start-iteration arguments. + updateClonedIVs(TL, Preheader, Args, VMap, IVArgIndex); + + // Add alignment assumptions to arguments of helper, based on alignment of + // values in old function. + { + NamedRegionTimer NRT("AddAlignmentAssumptions", + "Add alignment assumptions to Tapir-loop helper", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + AddAlignmentAssumptions(&F, Args, VMap, Preheader->getTerminator(), &AC, &DT); + } // end timed region + + // CreateHelper partially serializes the cloned copy of the loop by converting + // detached-rethrows into resumes. We now finish the job of serializing the + // cloned Tapir loop. + + // Move allocas in the newly cloned detached CFG to the entry block of the + // helper. + { + NamedRegionTimer NRT("updateAllocas", "Update allocas in Tapir-loop helper", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + // Collect the end instructions of the task. + SmallVector TaskEnds; + for (BasicBlock *EndBlock : ReattachBlocks) + TaskEnds.push_back(cast(VMap[EndBlock])->getTerminator()); + for (BasicBlock *EndBlock : DetachedRethrowBlocks) + TaskEnds.push_back(cast(VMap[EndBlock])->getTerminator()); + + // Move allocas in cloned detached block to entry of helper function. + BasicBlock *ClonedTaskEntry = cast(VMap[T->getEntry()]); + bool ContainsDynamicAllocas = MoveStaticAllocasInBlock( + &Helper->getEntryBlock(), ClonedTaskEntry, TaskEnds); + + // If this task uses a taskframe, move allocas in cloned taskframe entry to + // entry of helper function. + if (Spindle *TFCreate = T->getTaskFrameCreateSpindle()) { + BasicBlock *ClonedTFEntry = cast(VMap[TFCreate->getEntry()]); + ContainsDynamicAllocas |= MoveStaticAllocasInBlock( + &Helper->getEntryBlock(), ClonedTFEntry, TaskEnds); + } + // If the cloned loop contained dynamic alloca instructions, wrap the cloned + // loop with llvm.stacksave/llvm.stackrestore intrinsics. + if (ContainsDynamicAllocas) { + Module *M = Helper->getParent(); + // Get the two intrinsics we care about. + Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave); + Function *StackRestore = + Intrinsic::getDeclaration(M, Intrinsic::stackrestore); + + // Insert the llvm.stacksave. + CallInst *SavedPtr = + IRBuilder<>(&*ClonedTaskEntry, ClonedTaskEntry->begin()) + .CreateCall(StackSave, {}, "savedstack"); + + // Insert a call to llvm.stackrestore before the reattaches in the + // original Tapir loop. 
+ for (Instruction *ExitPoint : TaskEnds) + IRBuilder<>(ExitPoint).CreateCall(StackRestore, SavedPtr); + } + } + + // Convert the cloned detach and reattaches into unconditional branches. + { + NamedRegionTimer NRT("serializeClonedLoop", "Serialize cloned Tapir loop", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + DetachInst *ClonedDI = cast(VMap[DI]); + BasicBlock *ClonedDetacher = ClonedDI->getParent(); + BasicBlock *ClonedContinue = ClonedDI->getContinue(); + for (BasicBlock *RB : ReattachBlocks) { + ReattachInst *ClonedRI = cast(VMap[RB->getTerminator()]); + ReplaceInstWithInst(ClonedRI, BranchInst::Create(ClonedContinue)); + } + ClonedContinue->removePredecessor(ClonedDetacher); + BranchInst *DetachRepl = BranchInst::Create(ClonedDI->getDetached()); + ReplaceInstWithInst(ClonedDI, DetachRepl); + VMap[DI] = DetachRepl; + } // end timed region + + return Helper; +} + +/// Outline all recorded Tapir loops in the function. +TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { + // Prepare Tapir loops for outlining. + for (Task *T : post_order(TI.getRootTask())) { + if (TapirLoopInfo *TL = getTapirLoop(T)) { + PredicatedScalarEvolution PSE(SE, *TL->getLoop()); + bool canOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME, + ORE, TTI); + if (!canOutline) { + const Loop *L = TL->getLoop(); + TapirLoopHints Hints(L); + emitMissedWarning(L, Hints, &ORE); + forgetTapirLoop(TL); + continue; + } + + // Get an outline processor for each Tapir loop. + OutlineProcessors[TL] = + std::unique_ptr(getOutlineProcessor(TL)); + } + } + + TaskOutlineMapTy TaskToOutline; + DenseMap LoopInputSets; + DenseMap> LoopCtlArgs; + DenseMap> LoopCtlInputs; + + DenseMap LoopArgs; + DenseMap> LoopInputs; + DenseMap LoopArgStarts; + + getAllTapirLoopInputs(LoopInputSets, LoopCtlArgs, LoopCtlInputs); + + associateTasksToTapirLoops(); + + for (Task *T : post_order(TI.getRootTask())) { + LLVM_DEBUG(dbgs() << "Examining task@" << T->getEntry()->getName() << + " for outlining\n"); + // If any subtasks were outlined as Tapir loops, replace these loops with + // calls to the outlined functions. + { + NamedRegionTimer NRT("replaceSubLoopCalls", + "Update sub-Tapir-loops with calls to helpers", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + for (Task *SubT : T->subtasks()) { + if (TapirLoopInfo *TL = getTapirLoop(SubT)) { + // emitSCEVChecks(TL->getLoop(), TL->getBypass()); + Loop *L = TL->getLoop(); + TaskToOutline[SubT].replaceReplCall( + replaceLoopWithCallToOutline(TL, TaskToOutline[SubT], LoopInputs[L])); + } + } + } // end timed region + + TapirLoopInfo *TL = getTapirLoop(T); + if (!TL) + continue; + + Loop *L = TL->getLoop(); + LLVM_DEBUG(dbgs() << "Outlining Tapir " << *L << "\n"); + + // Convert the inputs of the Tapir loop to inputs to the helper. 
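The llvm.stacksave/llvm.stackrestore bracketing added above matters when the loop body allocates stack memory dynamically: once the iterations of one grain run serially inside the helper, each iteration's allocation must be released at the end of that iteration or the helper's stack would grow with the trip count. A hypothetical loop of that shape:

```cpp
#include <alloca.h>

#include <cilk/cilk.h>
#include <cstring>

// Hypothetical Tapir loop whose body performs a dynamic stack allocation;
// in the outlined helper, each iteration's alloca is bracketed by
// llvm.stacksave/llvm.stackrestore.
void scale_rows(double **rows, const int *lens, int n) {
  cilk_for (int i = 0; i < n; ++i) {
    double *tmp = (double *)alloca(lens[i] * sizeof(double));
    std::memcpy(tmp, rows[i], lens[i] * sizeof(double));
    for (int j = 0; j < lens[i]; ++j)
      rows[i][j] = 2.0 * tmp[j];
  }
}
```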
+ ValueSet TLInputsFixed; + ValueToValueMapTy InputMap; + Instruction *ArgStart; + { + NamedRegionTimer NRT("fixupHelperInputs", + "Fixup inputs to Tapir-loop body", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + ArgStart = + fixupHelperInputs(F, T, LoopInputSets[L], TLInputsFixed, + L->getLoopPreheader()->getTerminator(), + &*L->getHeader()->getFirstInsertionPt(), + OutlineProcessors[TL]->getArgStructMode(), InputMap, + L); + } // end timed region + + ValueSet HelperArgs; + SmallVector HelperInputs; + { + NamedRegionTimer NRT("setupLoopOutlineArgs", + "Setup inputs to Tapir-loop helper function", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + OutlineProcessors[TL]->setupLoopOutlineArgs( + F, HelperArgs, HelperInputs, LoopInputSets[L], LoopCtlArgs[L], + LoopCtlInputs[L], TLInputsFixed); + } // end timed region + + LLVM_DEBUG({ + dbgs() << "HelperArgs:\n"; + for (Value *V : HelperArgs) + dbgs() << "\t" << *V << "\n"; + dbgs() << "HelperInputs:\n"; + for (Value *V : HelperInputs) + dbgs() << "\t" << *V << "\n"; + }); + + LoopArgs[L] = HelperArgs; + for (Value *V : HelperInputs) + LoopInputs[L].push_back(V); + LoopArgStarts[L] = ArgStart; + + ValueToValueMapTy VMap; + // Create the helper function. + Function *Outline = createHelperForTapirLoop( + TL, LoopArgs[L], OutlineProcessors[TL]->getIVArgIndex(F, LoopArgs[L]), + OutlineProcessors[TL]->getLimitArgIndex(F, LoopArgs[L]), + &OutlineProcessors[TL]->getDestinationModule(), VMap, InputMap); + TaskToOutline[T] = TaskOutlineInfo( + Outline, T->getEntry(), cast(VMap[T->getDetach()]), + dyn_cast_or_null(VMap[T->getTaskFrameUsed()]), + LoopInputSets[L], LoopArgStarts[L], + L->getLoopPreheader()->getTerminator(), TL->getExitBlock(), + TL->getUnwindDest()); + + // Do ABI-dependent processing of each outlined Tapir loop. + { + NamedRegionTimer NRT("postProcessOutline", + "Post-process Tapir-loop helper function", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + OutlineProcessors[TL]->postProcessOutline(*TL, TaskToOutline[T], VMap); + } // end timed region + + LLVM_DEBUG({ + dbgs() << "LoopInputs[L]:\n"; + for (Value *V : LoopInputs[L]) + dbgs() << "\t" << *V << "\n"; + }); + + { + NamedRegionTimer NRT("clearMetadata", "Cleanup Tapir-loop metadata", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + TapirLoopHints Hints(L); + Hints.clearClonedLoopMetadata(VMap); + Hints.clearStrategy(); + } + + // Update subtask outline info to reflect the fact that their spawner was + // outlined. + { + NamedRegionTimer NRT("remapData", "Remap Tapir subloop information", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + LLVM_DEBUG(dbgs() << "Remapping subloop outline info.\n"); + for (Loop *SubL : *L) { + if (TapirLoopInfo *SubTL = getTapirLoop(SubL)) { + Task *SubT = SubTL->getTask(); + if (TaskToOutline.count(SubT)) { + TaskToOutline[SubT].remapOutlineInfo(VMap, InputMap); + OutlineProcessors[SubTL]->remapData(VMap); + } + } + } + } + } + + return TaskToOutline; +} + +bool LoopSpawningImpl::run() { + if (TI.isSerial()) + return false; + + // Discover all Tapir loops and record them. + for (Loop *TopLevelLoop : LI) + for (Loop *L : post_order(TopLevelLoop)) + if (Task *T = getTaskIfTapirLoop(L)) + createTapirLoop(L, T); + + if (TapirLoops.empty()) + return false; + + // Perform any Target-dependent preprocessing of F. + Target->preProcessFunction(F, TI, true); + + // Outline all Tapir loops. 
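Conceptually, the outlining step below rewrites the parent so that the whole Tapir loop becomes a single call into the generated helper, which is named with an `.ls<depth>` suffix per `createHelperForTapirLoop`. A hypothetical source-level before/after, with an illustrative argument list:

```cpp
#include <cilk/cilk.h>
#include <cstdint>

void body(double *a, uint64_t i);

// Before loop spawning: a Tapir loop in the parent.
void parent_before(double *a, uint64_t n) {
  cilk_for (uint64_t i = 0; i < n; ++i)
    body(a, i);
}

// Stand-in for the generated "<parent>.ls1" outline (see the D&C sketch
// earlier); the actual signature is derived from the loop's inputs.
void parent_loop_helper(double *a, uint64_t start, uint64_t end,
                        uint64_t grainsize);

// After loop spawning: the loop collapses to one call spanning [0, n).
void parent_after(double *a, uint64_t n, uint64_t grainsize) {
  parent_loop_helper(a, /*start=*/0, /*end=*/n, grainsize);
}
```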
+ TaskOutlineMapTy TapirLoopOutlines = outlineAllTapirLoops(); + + // Perform target-specific processing of the outlined-loop calls. + { + NamedRegionTimer NRT("processOutlinedLoopCall", + "Process calls to outlined loops", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + for (Task *T : post_order(TI.getRootTask())) + if (TapirLoopInfo *TL = getTapirLoop(T)) + OutlineProcessors[TL]->processOutlinedLoopCall(*TL, TapirLoopOutlines[T], + DT); + } // end timed region + + // Perform any Target-dependent postprocessing of F. + Target->postProcessFunction(F, true); + + LLVM_DEBUG({ + NamedRegionTimer NRT("verify", "Post-loop-spawning verification", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + if (verifyModule(*F.getParent(), &errs())) { + LLVM_DEBUG(dbgs() << "Module after loop spawning:" << *F.getParent()); + llvm_unreachable("Loop spawning produced bad IR!"); + } + }); + + return true; +} + +PreservedAnalyses LoopSpawningPass::run(Module &M, ModuleAnalysisManager &AM) { + auto &FAM = AM.getResult(M).getManager(); + auto GetDT = [&FAM](Function &F) -> DominatorTree & { + return FAM.getResult(F); + }; + auto GetLI = [&FAM](Function &F) -> LoopInfo & { + return FAM.getResult(F); + }; + auto GetTI = [&FAM](Function &F) -> TaskInfo & { + return FAM.getResult(F); + }; + auto GetSE = [&FAM](Function &F) -> ScalarEvolution & { + return FAM.getResult(F); + }; + auto GetAC = [&FAM](Function &F) -> AssumptionCache & { + return FAM.getResult(F); + }; + auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { + return FAM.getResult(F); + }; + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; + auto GetORE = [&FAM](Function &F) -> OptimizationRemarkEmitter & { + return FAM.getResult(F); + }; + + SmallVector WorkList; + bool Changed = false; + for (Function &F : M) + if (!F.empty()) + WorkList.push_back(&F); + + // Transform all loops into simplified, LCSSA form before we process them. + for (Function *F : WorkList) { + LoopInfo &LI = GetLI(*F); + DominatorTree &DT = GetDT(*F); + ScalarEvolution &SE = GetSE(*F); + SmallVector LoopWorkList; + for (Loop *L : LI) { + Changed |= simplifyLoop(L, &DT, &LI, &SE, &GetAC(*F), nullptr, + /* PreserveLCSSA */ false); + LoopWorkList.push_back(L); + } + for (Loop *L : LoopWorkList) + Changed |= formLCSSARecursively(*L, DT, &LI, &SE); + } + + // Now process each loop. + for (Function *F : WorkList) { + TapirTargetID TargetID = GetTLI(*F).getTapirTarget(); + std::unique_ptr Target(getTapirTargetFromID(M, TargetID)); + Changed |= LoopSpawningImpl(*F, GetDT(*F), GetLI(*F), GetTI(*F), GetSE(*F), + GetAC(*F), GetTTI(*F), Target.get(), GetORE(*F)) + .run(); + } + if (Changed) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { +// NB: Technicaly LoopSpawningTI should be a ModulePass, because it changes the +// contents of the module. But because a ModulePass cannot use many function +// analyses -- doing so results in invalid memory accesses -- we have to make +// LoopSpawningTI a FunctionPass. This problem is fixed with the new pass +// manager. 
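For completeness, a minimal sketch of running the new-pass-manager `LoopSpawningPass` above from C++. The header path is assumed, and in practice the pass is scheduled as part of the Tapir lowering pipeline rather than by hand:

```cpp
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Tapir/LoopSpawning.h" // assumed header location

using namespace llvm;

void runLoopSpawning(Module &M) {
  // Standard new-pass-manager boilerplate for analysis registration.
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(LoopSpawningPass());
  MPM.run(M, MAM);
}
```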
+struct LoopSpawningTI : public FunctionPass { + /// Pass identification, replacement for typeid + static char ID; + explicit LoopSpawningTI() : FunctionPass(ID) { + initializeLoopSpawningTIPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + Module &M = *F.getParent(); + + auto &DT = getAnalysis().getDomTree(); + auto &LI = getAnalysis().getLoopInfo(); + auto &TI = getAnalysis().getTaskInfo(); + auto &SE = getAnalysis().getSE(); + auto &AC = getAnalysis().getAssumptionCache(F); + auto &TLI = getAnalysis().getTLI(F); + TapirTargetID TargetID = TLI.getTapirTarget(); + auto &TTI = getAnalysis().getTTI(F); + auto &ORE = getAnalysis().getORE(); + + LLVM_DEBUG(dbgs() << "LoopSpawningTI on function " << F.getName() << "\n"); + TapirTarget *Target = getTapirTargetFromID(M, TargetID); + bool Changed = + LoopSpawningImpl(F, DT, LI, TI, SE, AC, TTI, Target, ORE).run(); + delete Target; + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } +}; +} + +char LoopSpawningTI::ID = 0; +static const char ls_name[] = "Loop Spawning with Task Info"; +INITIALIZE_PASS_BEGIN(LoopSpawningTI, LS_NAME, ls_name, false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_END(LoopSpawningTI, LS_NAME, ls_name, false, false) + +namespace llvm { +Pass *createLoopSpawningTIPass() { + return new LoopSpawningTI(); +} +} diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp new file mode 100644 index 000000000000000..02e04f4fefbf0d3 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -0,0 +1,1559 @@ +//===- LoopStripMine.cpp - Loop strip-mining utilities --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements some loop strip-mining utilities. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/LoopStripMine.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Tapir/TapirLoopInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include "llvm/Transforms/Utils/TapirUtils.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" + +using namespace llvm; + +#define LSM_NAME "loop-stripmine" +#define DEBUG_TYPE LSM_NAME + +static cl::opt StripMineCount( + "stripmine-count", cl::Hidden, + cl::desc("Use this stripmine count for all loops, for testing purposes")); + +static cl::opt StripMineCoarseningFactor( + "stripmine-coarsen-factor", cl::Hidden, + cl::desc("Use this coarsening factor for stripmining")); + +static cl::opt StripMineUnrollRemainder( + "stripmine-unroll-remainder", cl::Hidden, + cl::desc("Allow the loop remainder after stripmining to be unrolled.")); + +/// Constants for stripmining cost analysis. +namespace StripMineConstants { +/// Default coarsening factor for strpimined Tapir loops. +const unsigned DefaultCoarseningFactor = 2048; +} + +/// The function chooses which type of stripmine (epilog or prolog) is more +/// profitabale. +/// Epilog stripmine is more profitable when there is PHI that starts from +/// constant. In this case epilog will leave PHI start from constant, +/// but prolog will convert it to non-constant. +/// +/// loop: +/// PN = PHI [I, Latch], [CI, Preheader] +/// I = foo(PN) +/// ... +/// +/// Epilog stripmine case. +/// loop: +/// PN = PHI [I2, Latch], [CI, Preheader] +/// I1 = foo(PN) +/// I2 = foo(I1) +/// ... +/// Prolog stripmine case. +/// NewPN = PHI [PrologI, Prolog], [CI, Preheader] +/// loop: +/// PN = PHI [I2, Latch], [NewPN, Preheader] +/// I1 = foo(PN) +/// I2 = foo(I1) +/// ... +/// +static bool isEpilogProfitable(const Loop *L) { + const BasicBlock *Preheader = L->getLoopPreheader(); + const BasicBlock *Header = L->getHeader(); + assert(Preheader && Header); + for (const PHINode &PN : Header->phis()) { + if (isa(PN.getIncomingValueForBlock(Preheader))) + return true; + } + return false; +} + +/// Perform some cleanup and simplifications on loops after stripmining. It is +/// useful to simplify the IV's in the new loop, as well as do a quick +/// simplify/dce pass of the instructions. +void llvm::simplifyLoopAfterStripMine(Loop *L, bool SimplifyIVs, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, + const TargetTransformInfo &TTI, + AssumptionCache *AC) { + // Simplify any new induction variables in the stripmined loop. 
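At the source level, the strip-mining implemented in this file turns a Tapir loop that spawns every iteration into one that spawns a chunk of `grainsize` iterations at a time, with a serial epilog for the leftover iterations. A rough hand-written picture, assuming `grainsize >= 1`:

```cpp
#include <cilk/cilk.h>
#include <cstdint>

void body(double *a, uint64_t i);

// Illustrative only; the pass produces this shape in IR, not in source.
void stripmined(double *a, uint64_t n, uint64_t grainsize) {
  uint64_t full = n - n % grainsize; // iterations covered by whole chunks
  cilk_for (uint64_t chunk = 0; chunk < full; chunk += grainsize)
    for (uint64_t i = chunk; i < chunk + grainsize; ++i) // serial chunk
      body(a, i);
  for (uint64_t i = full; i < n; ++i) // epilog: the remaining n % grainsize
    body(a, i);
}
```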
+ if (SE && SimplifyIVs) { + SmallVector DeadInsts; + simplifyLoopIVs(L, SE, DT, LI, &TTI, DeadInsts); + + // Aggressively clean up dead instructions that simplifyLoopIVs already + // identified. Any remaining should be cleaned up below. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); + } + + // At this point, the code is well formed. We now do a quick sweep over the + // inserted code, doing constant propagation and dead code elimination as we + // go. + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + const std::vector &NewLoopBlocks = L->getBlocks(); + for (BasicBlock *BB : NewLoopBlocks) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { + Instruction *Inst = &*I++; + + if (Value *V = simplifyInstruction(Inst, {DL, nullptr, DT, AC})) + if (LI->replacementPreservesLCSSAForm(Inst, V)) + Inst->replaceAllUsesWith(V); + if (isInstructionTriviallyDead(Inst)) + Inst->eraseFromParent(); + } + } + + // TODO: after stripmining, previously loop variant conditions are likely to + // fold to constants, eagerly propagating those here will require fewer + // cleanup passes to be run. Alternatively, a LoopEarlyCSE might be + // appropriate. +} + +/// Gather the various unrolling parameters based on the defaults, compiler +/// flags, TTI overrides and user specified parameters. +TargetTransformInfo::StripMiningPreferences llvm::gatherStripMiningPreferences( + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, + std::optional UserCount) { + TargetTransformInfo::StripMiningPreferences SMP; + + // Set up the defaults + SMP.Count = 0; + SMP.AllowExpensiveTripCount = false; + SMP.DefaultCoarseningFactor = + (StripMineCoarseningFactor.getNumOccurrences() > 0) ? + StripMineCoarseningFactor : StripMineConstants::DefaultCoarseningFactor; + SMP.UnrollRemainder = false; + + // Override with any target specific settings + TTI.getStripMiningPreferences(L, SE, SMP); + + // Apply any user values specified by cl::opt + if (UserCount) + SMP.Count = *UserCount; + if (StripMineUnrollRemainder.getNumOccurrences() > 0) + SMP.UnrollRemainder = StripMineUnrollRemainder; + + return SMP; +} + +// If loop has an grainsize pragma return the (necessarily positive) value from +// the pragma for stripmining. Otherwise return 0. +static unsigned StripMineCountPragmaValue(const Loop *L) { + TapirLoopHints Hints(L); + return Hints.getGrainsize(); +} + +// Returns true if stripmine count was set explicitly. +// Calculates stripmine count and writes it to SMP.Count. +bool llvm::computeStripMineCount( + Loop *L, const TargetTransformInfo &TTI, InstructionCost LoopCost, + TargetTransformInfo::StripMiningPreferences &SMP) { + // Check for explicit Count. + // 1st priority is stripmine count set by "stripmine-count" option. + bool UserStripMineCount = StripMineCount.getNumOccurrences() > 0; + if (UserStripMineCount) { + SMP.Count = StripMineCount; + SMP.AllowExpensiveTripCount = true; + return true; + } + + // 2nd priority is stripmine count set by pragma. + unsigned PragmaCount = StripMineCountPragmaValue(L); + if (PragmaCount > 0) { + SMP.Count = PragmaCount; + SMP.AllowExpensiveTripCount = true; + return true; + } + + // 3rd priority is computed stripmine count. + // + // We want to coarsen the loop such that the work of detaching a loop + // iteration is tiny compared to the work of the loop body. 
Specifically, we + // want the total cost of the parallel loop to be at most (1 + \eps) times the + // cost of its serial projection. Let G is the grainsize, n the number of + // loop iterations, d the cost of a detach, and S the work of the loop body. + // Then we want + // + // (n/G)(G*S + d) <= (1 + \eps)(n * S) + // + // Solving for G yeilds G >= d/(\eps * S). Substituting in \eps = 1/C for a + // given coarsening factor C gives the equation below. + Instruction *DetachI = L->getHeader()->getTerminator(); + SMP.Count = *((SMP.DefaultCoarseningFactor * + TTI.getInstructionCost( + DetachI, TargetTransformInfo::TCK_SizeAndLatency) / + LoopCost) + .getValue()); + + return false; +} + +static Task *getTapirLoopForStripMining(const Loop *L, TaskInfo &TI, + OptimizationRemarkEmitter *ORE) { + LLVM_DEBUG(dbgs() << "Analyzing for stripmining: " << *L); + // We only handle Tapir loops. + Task *T = getTaskIfTapirLoopStructure(L, &TI); + if (!T) + return nullptr; + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + LLVM_DEBUG(dbgs() + << " Can't stripmine: loop preheader-insertion failed.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(LSM_NAME, "NoPreheader", L) + << "loop lacks a preheader"); + return nullptr; + } + assert(isa(Preheader->getTerminator()) && + "Preheader not terminated by a branch"); + + BasicBlock *LatchBlock = L->getLoopLatch(); + if (!LatchBlock) { + LLVM_DEBUG(dbgs() + << " Can't stripmine: loop exit-block-insertion failed.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(LSM_NAME, "NoLatch", L) + << "loop lacks a latch"); + return nullptr; + } + + // Loops with indirectbr cannot be cloned. + if (!L->isSafeToClone()) { + LLVM_DEBUG(dbgs() << " Can't stripmine: loop body cannot be cloned.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(LSM_NAME, "UnsafeToClone", + L) + << "loop is not safe to clone"); + return nullptr; + } + + // Tapir loops where the loop body does not reattach cannot be stripmined. + if (!llvm::any_of(predecessors(LatchBlock), [](const BasicBlock *B) { + return isa(B->getTerminator()); + })) { + LLVM_DEBUG(dbgs() << " Can't stripmine: loop body does not reattach.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(LSM_NAME, "NoReattach", L) + << "spawned loop body does not reattach"); + return nullptr; + } + + // The current loop-stripmine pass can only stripmine loops with a single + // latch that's a conditional branch exiting the loop. + // FIXME: The implementation can be extended to work with more complicated + // cases, e.g. loops with multiple latches. + BranchInst *BI = dyn_cast(LatchBlock->getTerminator()); + + if (!BI || BI->isUnconditional()) { + // The loop-rotate pass can be helpful to avoid this in many cases. 
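Spelling out the algebra in the grainsize comment above, with $\varepsilon = 1/C$ for coarsening factor $C$, which is what the computed `SMP.Count` implements as `C * cost(detach) / cost(body)`:

```latex
\frac{n}{G}\bigl(G S + d\bigr) \le (1+\varepsilon)\, n S
\;\Longleftrightarrow\;
n S + \frac{n d}{G} \le n S + \varepsilon\, n S
\;\Longleftrightarrow\;
G \ge \frac{d}{\varepsilon S} = \frac{C\, d}{S}.
```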
+ LLVM_DEBUG( + dbgs() + << " Can't stripmine: loop not terminated by a conditional branch.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(LSM_NAME, "NoLatchBranch", + L) + << "loop latch is not terminated by a conditional branch"); + return nullptr; + } + + BasicBlock *Header = L->getHeader(); + auto CheckSuccessors = [&](unsigned S1, unsigned S2) { + return BI->getSuccessor(S1) == Header && !L->contains(BI->getSuccessor(S2)); + }; + + if (!CheckSuccessors(0, 1) && !CheckSuccessors(1, 0)) { + LLVM_DEBUG(dbgs() << " Can't stripmine: only loops with one conditional" + " latch exiting the loop can be stripmined.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(LSM_NAME, + "ComplexLatchBranch", L) + << "loop has multiple exiting conditional latches"); + return nullptr; + } + + if (Header->hasAddressTaken()) { + // The loop-rotate pass can be helpful to avoid this in many cases. + LLVM_DEBUG( + dbgs() << " Won't stripmine loop: address of header block is " + "taken.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(LSM_NAME, + "HeaderAddressTaken", L) + << "loop header block has address taken"); + return nullptr; + } + + // Don't stripmine loops with the convergent attribute. + for (auto &BB : L->blocks()) + for (auto &I : *BB) + if (CallBase *CB = dyn_cast(&I)) + if (CB->isConvergent()) { + LLVM_DEBUG( + dbgs() << " Won't stripmine loop: contains convergent " + "attribute.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(LSM_NAME, + "ConvergentLoop", L) + << "loop contains convergent attribute"); + return nullptr; + } + + // TODO: Generalize this condition to support stripmining with a prolog. +#ifndef NDEBUG + if (!isEpilogProfitable(L)) { + dbgs() << "Stripmining loop with unprofitable epilog.\n"; + } +#endif + + // Get the task for this loop. + return T; +} + +/// Connect the stripmining epilog code to the original loop. +/// The stripmining epilog code contains code to execute the +/// 'extra' iterations if the run-time trip count modulo the +/// stripmine count is non-zero. +/// +/// This function performs the following: +/// - Update PHI operands in the epilog loop by the new PHI nodes +/// - Branch around the epilog loop if extra iters (ModVal) is zero. +/// +static void ConnectEpilog(TapirLoopInfo &TL, Value *EpilStartIter, + Value *ModVal, BasicBlock *LoopDet, + BasicBlock *LoopEnd, BasicBlock *NewExit, + BasicBlock *Exit, BasicBlock *Preheader, + BasicBlock *EpilogPreheader, ValueToValueMapTy &VMap, + DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, + const DataLayout &DL, bool PreserveLCSSA) { + // NewExit should contain no PHI nodes. +#ifndef NDEBUG + bool ContainsPHIs = false; + for (PHINode &PN : NewExit->phis()) { + dbgs() << "NewExit PHI node: " << PN << "\n"; + ContainsPHIs = true; + } + assert(!ContainsPHIs && "NewExit should not contain PHI nodes."); +#endif + + // Create PHI nodes at NewExit (from the stripmining loop Latch and + // Preheader). Update corresponding PHI nodes in epilog loop. + IRBuilder<> B(EpilogPreheader->getTerminator()); + for (auto &InductionEntry : *TL.getInductionVars()) { + // Compute the value of this induction at NewExit. + const InductionDescriptor &II = InductionEntry.second; + // Get the new step value for this Phi. + Value *PhiIter = !II.getStep()->getType()->isIntegerTy() + ? 
B.CreateCast(Instruction::SIToFP, EpilStartIter, + II.getStep()->getType()) + : B.CreateSExtOrTrunc(EpilStartIter, II.getStep()->getType()); + Value *NewPhiStart = emitTransformedIndex(B, PhiIter, SE, DL, II); + + // Update the PHI node in the epilog loop. + PHINode *PN = cast(VMap[InductionEntry.first]); + PN->setIncomingValue(PN->getBasicBlockIndex(EpilogPreheader), NewPhiStart); + } + + Instruction *InsertPt = NewExit->getTerminator(); + B.SetInsertPoint(InsertPt); + Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod"); + assert(Exit && "Loop must have a single exit block only"); + // Split the epilogue exit to maintain loop canonicalization guarantees + SmallVector Preds(predecessors(Exit)); + SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr, + PreserveLCSSA); + // Add the branch to the exit block (around the stripmining loop) + B.CreateCondBr(BrLoopExit, EpilogPreheader, Exit); + InsertPt->eraseFromParent(); + if (DT) + DT->changeImmediateDominator(Exit, NewExit); + + // Split the main loop exit to maintain canonicalization guarantees. + SmallVector NewExitPreds{LoopDet}; + if (LoopEnd != NewExit) + NewExitPreds.push_back(LoopEnd); + SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr, + PreserveLCSSA); +} + +/// Create a clone of the blocks in a loop and connect them together. +/// If CreateRemainderLoop is false, loop structure will not be cloned, +/// otherwise a new loop will be created including all cloned blocks, and the +/// iterator of it switches to count NewIter down to 0. +/// The cloned blocks should be inserted between InsertTop and InsertBot. +/// If loop structure is cloned InsertTop should be new preheader, InsertBot +/// new loop exit. +/// Return the new cloned loop that is created when CreateRemainderLoop is true. +static Loop * +CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, + const bool UseEpilogRemainder, const bool UnrollRemainder, + BasicBlock *InsertTop, BasicBlock *InsertBot, + BasicBlock *Preheader, std::vector &NewBlocks, + LoopBlocksDFS &LoopBlocks, + SmallVectorImpl &ExtraTaskBlocks, + SmallVectorImpl &SharedEHTaskBlocks, + ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) { + StringRef suffix = UseEpilogRemainder ? "epil" : "prol"; + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + Function *F = Header->getParent(); + LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); + LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); + Loop *ParentLoop = L->getParentLoop(); + NewLoopsMap NewLoops; + NewLoops[ParentLoop] = ParentLoop; + if (!CreateRemainderLoop) + NewLoops[L] = ParentLoop; + + // For each block in the original loop, create a new copy, + // and update the value map with the newly created values. + for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { + BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F); + NewBlocks.push_back(NewBB); + + // Add the cloned block to loop info. + addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops); + + VMap[*BB] = NewBB; + if (Header == *BB) { + // For the first block, add a CFG connection to this newly + // created block. + InsertTop->getTerminator()->setSuccessor(0, NewBB); + } + + if (DT) { + if (Header == *BB) { + // The header is dominated by the preheader. + DT->addNewBlock(NewBB, InsertTop); + } else { + // Copy information from original loop to the clone. 
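+          // That is, the clone of *BB gets the clone of *BB's original
+          // immediate dominator as its immediate dominator; that block was
+          // already cloned earlier in this reverse post-order walk.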
+ BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock(); + DT->addNewBlock(NewBB, cast(VMap[IDomBB])); + } + } + + if (Latch == *BB) { + // For the last block, if CreateRemainderLoop is false, create a direct + // jump to InsertBot. If not, create a loop back to cloned head. + VMap.erase((*BB)->getTerminator()); + BasicBlock *FirstLoopBB = cast(VMap[Header]); + BranchInst *LatchBR = cast(NewBB->getTerminator()); + IRBuilder<> Builder(LatchBR); + if (!CreateRemainderLoop) { + Builder.CreateBr(InsertBot); + } else { + PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, + suffix + ".iter", + FirstLoopBB->getFirstNonPHI()); + Value *IdxSub = + Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".sub"); + Value *IdxCmp = + Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp"); + Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot); + NewIdx->addIncoming(NewIter, InsertTop); + NewIdx->addIncoming(IdxSub, NewBB); + } + LatchBR->eraseFromParent(); + } + } + + DetachInst *DI = cast(Header->getTerminator()); + // Create new copies of the EH blocks to clone. We can handle these blocks + // more simply than the loop blocks. + for (BasicBlock *BB : ExtraTaskBlocks) { + BasicBlock *NewBB = CloneBasicBlock(BB, VMap, "." + suffix, F); + NewBlocks.push_back(NewBB); + + // Add the cloned block to loop info. + if (LI->getLoopFor(BB)) + addClonedBlockToLoopInfo(BB, NewBB, LI, NewLoops); + + VMap[BB] = NewBB; + + // Update PHI nodes in the detach-unwind destination. Strictly speaking, + // this step isn't necessary, since the epilog loop will be serialized later + // and these new entries for the PHI nodes will therefore be removed. But + // the routine for serializing the detach expects valid LLVM, so we update + // the PHI nodes here to ensure the resulting LLVM is valid. + if (DI->hasUnwindDest()) { + if (isDetachedRethrow(BB->getTerminator(), DI->getSyncRegion())) { + InvokeInst *DR = dyn_cast(BB->getTerminator()); + for (PHINode &PN : DR->getUnwindDest()->phis()) + PN.addIncoming(PN.getIncomingValueForBlock(BB), NewBB); + } + } + } + + // Update PHI nodes in successors of ExtraTaskBlocks, based on the cloned + // values. + for (BasicBlock *BB : ExtraTaskBlocks) { + for (BasicBlock *Succ : successors(BB)) { + if (VMap.count(Succ)) + continue; + + for (PHINode &PN : Succ->phis()) { + Value *Val = PN.getIncomingValueForBlock(BB); + Value *NewVal = VMap.count(Val) ? cast(VMap[Val]) : Val; + PN.addIncoming(NewVal, cast(VMap[BB])); + } + } + } + + // Update DT to accommodate cloned ExtraTaskBlocks. + if (DT) { + for (BasicBlock *BB : ExtraTaskBlocks) { + BasicBlock *NewBB = cast(VMap[BB]); + // Copy information from original loop to the clone, if it's available. + BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); + if (VMap.count(IDomBB)) { + DT->addNewBlock(NewBB, cast(VMap[IDomBB])); + } else { + BasicBlock *NewIDom = nullptr; + // Get the idom of BB's predecessors. + for (BasicBlock *Pred : predecessors(BB)) + if (VMap.count(Pred)) { + if (NewIDom) + NewIDom = DT->findNearestCommonDominator(NewIDom, Pred); + else + NewIDom = Pred; + } + // Use this computed idom (or its clone) as the idom of the cloned BB. + if (VMap.count(NewIDom)) + DT->addNewBlock(NewBB, cast(VMap[NewIDom])); + else + DT->addNewBlock(NewBB, NewIDom); + } + } + } + + // Change the incoming values to the ones defined in the preheader or + // cloned loop. 
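+  // For example (hypothetical names), a header PHI such as
+  //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
+  // has its preheader entry redirected to InsertTop and, when a remainder
+  // loop is created, its latch entry remapped to the cloned latch and the
+  // cloned increment.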
+ for (BasicBlock::iterator I = Header->begin(); isa(I); ++I) { + PHINode *NewPHI = cast(VMap[&*I]); + if (!CreateRemainderLoop) { + if (UseEpilogRemainder) { + unsigned idx = NewPHI->getBasicBlockIndex(Preheader); + NewPHI->setIncomingBlock(idx, InsertTop); + NewPHI->removeIncomingValue(Latch, false); + } else { + VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader); + NewPHI->eraseFromParent(); + } + } else { + unsigned idx = NewPHI->getBasicBlockIndex(Preheader); + NewPHI->setIncomingBlock(idx, InsertTop); + BasicBlock *NewLatch = cast(VMap[Latch]); + idx = NewPHI->getBasicBlockIndex(Latch); + Value *InVal = NewPHI->getIncomingValue(idx); + NewPHI->setIncomingBlock(idx, NewLatch); + if (Value *V = VMap.lookup(InVal)) + NewPHI->setIncomingValue(idx, V); + } + } + + // Add entries to PHI nodes outside of loop. Strictly speaking, this step + // isn't necessary, since the epilog loop will be serialized later and these + // new entries for the PHI nodes will therefore be removed. But the routine + // for serializing the detach expects valid LLVM, so we update the PHI nodes + // here to ensure the resulting LLVM is valid. + BasicBlock *ClonedHeader = cast(VMap[Header]); + DetachInst *ClonedDetach = cast(ClonedHeader->getTerminator()); + if (BasicBlock *Unwind = ClonedDetach->getUnwindDest()) + for (PHINode &PN : Unwind->phis()) + PN.addIncoming(PN.getIncomingValueForBlock(Header), ClonedHeader); + + if (CreateRemainderLoop) { + Loop *NewLoop = NewLoops[L]; + assert(NewLoop && "L should have been cloned"); + + // Only add loop metadata if the loop is not going to be completely + // unrolled. + if (UnrollRemainder) + return NewLoop; + + // FIXME? + // // Add unroll disable metadata to disable future unrolling for this loop. + // NewLoop->setLoopAlreadyUnrolled(); + return NewLoop; + } + else + return nullptr; +} + +// Helper function to get the basic-block predecessors of the given exceptional +// continuation BB associated with task T. These predecessors are either +// enclosed by task T or come from the unwind of the detach that spawns T. +// +// TODO: Move some of this logic into TapirTaskInfo, so we don't have to +// recompute it? +static void getEHContPredecessors(BasicBlock *BB, Task *T, + SmallVectorImpl &Preds, + TaskInfo &TI) { + DetachInst *DI = T->getDetach(); + assert(DI && "Root task does not have an exceptional continuation."); + assert(DI->hasUnwindDest() && + "Task does not have an exceptional continuation."); + + // Get the predecessors of BB enclosed by task T. + for (BasicBlock *Pred : predecessors(BB)) + if (T->encloses(Pred)) + Preds.push_back(Pred); + + // If the unwind destination of the detach is the exceptional continuation BB, + // add the block that performs the detach and return. + if (DI->getUnwindDest() == BB) { + Preds.push_back(DI->getParent()); + return; + } + + // Get the predecessor that comes from the unwind of the detach. + BasicBlock *DetUnwind = DI->getUnwindDest(); + while (DetUnwind->getUniqueSuccessor() != BB) + DetUnwind = DetUnwind->getUniqueSuccessor(); + Preds.push_back(DetUnwind); +} + +// Helper method to nest the exception-handling code of a task with exceptional +// continuation EHCont within a new parent task. 
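+// Roughly (a sketch of the shape produced by the code below): the
+// predecessors of EHCont inside the task are split into a new block InnerUD,
+// which is re-terminated with a detached rethrow unwinding to a second new
+// block OuterUD; OuterUD carries the landing pad for the new outer detach, so
+// exceptions escaping the original task first unwind into the enclosing task
+// before reaching EHCont.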
+static BasicBlock *NestDetachUnwindPredecessors( + BasicBlock *EHCont, Value *EHContLPad, ArrayRef Preds, + BasicBlock *NewDetachBB, const char *Suffix1, const char *Suffix2, + LandingPadInst *OrigLPad, Value *SyncReg, Module *M, DominatorTree *DT, + LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA) { + BasicBlock *InnerUD, *OuterUD; + Value *InnerUDLPad; + Type *OrigLPadTy = OrigLPad->getType(); + if (EHCont->isLandingPad()) { + SmallVector NewBBs; + SplitLandingPadPredecessors(EHCont, Preds, Suffix1, Suffix2, NewBBs, DT, LI, + MSSAU, PreserveLCSSA); + InnerUD = NewBBs[0]; + OuterUD = NewBBs[1]; + InnerUDLPad = InnerUD->getLandingPadInst(); + + // Remove InnerUD from the PHI nodes in EHCont. + for (PHINode &PN : EHCont->phis()) + PN.removeIncomingValue(InnerUD); + } else { + // Split the given Task predecessors of EHCont, which are given in Preds. + InnerUD = SplitBlockPredecessors(EHCont, Preds, Suffix1, DT, LI, MSSAU, + PreserveLCSSA); + // Split the NewDetachBB predecessor of EHCont. + OuterUD = SplitBlockPredecessors(EHCont, {NewDetachBB}, Suffix2, DT, LI, + MSSAU, PreserveLCSSA); + + // Create a new landing pad for the outer detach by cloning the landing pad + // from the old detach-unwind destination. + Instruction *Clone = OrigLPad->clone(); + Clone->setName(Twine("lpad") + Suffix2); + Clone->insertInto(OuterUD, OuterUD->getFirstInsertionPt()); + + // Update the PHI nodes in EHCont to accommodate OuterUD. If the PHI node + // corresponds to the EHCont landingpad value, set its incoming value from + // OuterUD to be the new landingpad. For all other PHI nodes, use the + // incoming value associated with InnerUD. + Value *OuterUDTmpVal = nullptr; + for (PHINode &PN : EHCont->phis()) { + if (&PN == EHContLPad) { + int OuterUDIdx = PN.getBasicBlockIndex(OuterUD); + OuterUDTmpVal = PN.getIncomingValue(OuterUDIdx); + PN.setIncomingValue(OuterUDIdx, Clone); + } else + PN.setIncomingValue(PN.getBasicBlockIndex(OuterUD), + PN.getIncomingValueForBlock(InnerUD)); + } + + if (Instruction *OuterUDTmpInst = dyn_cast(OuterUDTmpVal)) { + // Remove the temporary value for the new detach's unwind. + assert(OuterUDTmpInst->hasNUses(0) && + "Unexpected uses of a detach-unwind temporary value."); + OuterUDTmpInst->eraseFromParent(); + } + + // Remove InnerUD from the PHI nodes in EHCont. Record the value of the + // EHCont landingpad that comes from InnerUD. + InnerUDLPad = EHContLPad; + for (PHINode &PN : EHCont->phis()) { + if (&PN == EHContLPad) + InnerUDLPad = PN.getIncomingValueForBlock(InnerUD); + PN.removeIncomingValue(InnerUD); + } + } + + // Replace the termination of InnerUD with a detached rethrow. Start by + // creating a block for the unreachable destination of the detached rethrow. + BasicBlock *NewUnreachable = + SplitBlock(InnerUD, InnerUD->getTerminator(), DT, LI); + NewUnreachable->setName(InnerUD->getName() + ".unreachable"); + + // Insert a detached rethrow to the end of InnerUD. NewUnreachable is the + // normal destination of this detached rethrow, and OuterUD is the unwind + // destination. + ReplaceInstWithInst( + InnerUD->getTerminator(), + InvokeInst::Create(Intrinsic::getDeclaration( + M, Intrinsic::detached_rethrow, {OrigLPadTy}), + NewUnreachable, OuterUD, {SyncReg, InnerUDLPad})); + + // Terminate NewUnreachable with an unreachable. 
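+  // (The new unreachable takes over the debug location of the block's current
+  // terminator, which is erased afterwards.)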
+ IRBuilder<> B(NewUnreachable->getTerminator()); + Instruction *UnreachableTerm = cast(B.CreateUnreachable()); + UnreachableTerm->setDebugLoc(NewUnreachable->getTerminator()->getDebugLoc()); + NewUnreachable->getTerminator()->eraseFromParent(); + + // Inform the dominator tree of the deleted edge + if (DT) + DT->deleteEdge(NewUnreachable, EHCont); + + return OuterUD; +} + +Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, + bool UnrollRemainder, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, + const TargetTransformInfo &TTI, AssumptionCache *AC, + TaskInfo *TI, OptimizationRemarkEmitter *ORE, + bool PreserveLCSSA, bool ParallelEpilog, + bool NeedNestedSync, Loop **RemainderLoop) { + Task *T = getTapirLoopForStripMining(L, *TI, ORE); + if (!T) + return nullptr; + + TapirLoopInfo TL(L, T); + + // TODO: Add support for loop peeling, i.e., using a prolog. + + // Use Scalar Evolution to compute the trip count. This allows more loops to + // be stripmined than relying on induction var simplification. + if (!SE) + return nullptr; + PredicatedScalarEvolution PSE(*SE, *L); + + TL.collectIVs(PSE, LSM_NAME, ORE); + + // If no primary induction was found, just bail. + if (!TL.hasPrimaryInduction()) { + LLVM_DEBUG(dbgs() << "No primary induction variable found in loop."); + return nullptr; + } + PHINode *PrimaryInduction = TL.getPrimaryInduction().first; + LLVM_DEBUG(dbgs() << "\tPrimary induction " << *PrimaryInduction << "\n"); + + Value *TripCount = TL.getOrCreateTripCount(PSE, LSM_NAME, ORE); + if (!TripCount) { + LLVM_DEBUG(dbgs() << "Could not compute trip count.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(LSM_NAME, "NoTripCount", L) + << "could not compute finite loop trip count."); + return nullptr; + } + + LLVM_DEBUG(dbgs() << "\tTrip count " << *TripCount << "\n"); + + // Fixup all external uses of the IVs. + for (auto &InductionEntry : *TL.getInductionVars()) + TL.fixupIVUsers(InductionEntry.first, InductionEntry.second, PSE); + + // High-level algorithm: Generate an epilog for the Tapir loop and insert it + // between the original latch and its exit. Then split the entry and reattach + // block of the loop body to build the serial inner loop. + + BasicBlock *Preheader = L->getLoopPreheader(); + BranchInst *PreheaderBR = cast(Preheader->getTerminator()); + BasicBlock *Latch = L->getLoopLatch(); + BasicBlock *Header = L->getHeader(); + BasicBlock *TaskEntry = T->getEntry(); + assert(isa(Header->getTerminator()) && + "Header not terminated by a detach."); + DetachInst *DI = cast(Header->getTerminator()); + assert(DI->getDetached() == TaskEntry && + "Task entry does not match block detached from header."); + BasicBlock *ParentEntry = T->getParentTask()->getEntry(); + BranchInst *LatchBR = cast(Latch->getTerminator()); + unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; + BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); + + // We will use the increment of the primary induction variable to derive + // wrapping flags. + Instruction *PrimaryInc = + cast(PrimaryInduction->getIncomingValueForBlock(Latch)); + + // Get all uses of the primary induction variable in the task. 
+ SmallVector PrimaryInductionUsesInTask; + for (Use &U : PrimaryInduction->uses()) + if (Instruction *User = dyn_cast(U.getUser())) + if (T->encloses(User->getParent())) + PrimaryInductionUsesInTask.push_back(&U); + + // Only stripmine loops with a computable trip count, and the trip count needs + // to be an int value (allowing a pointer type is a TODO item). + // We calculate the backedge count by using getExitCount on the Latch block, + // which is proven to be the only exiting block in this loop. This is same as + // calculating getBackedgeTakenCount on the loop (which computes SCEV for all + // exiting blocks). + const SCEV *BECountSC = TL.getBackedgeTakenCount(PSE); + if (isa(BECountSC) || + !BECountSC->getType()->isIntegerTy()) { + LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n"); + return nullptr; + } + + unsigned BEWidth = + cast(TL.getWidestInductionType())->getBitWidth(); + + // Add 1 since the backedge count doesn't include the first loop iteration. + const SCEV *TripCountSC = TL.getExitCount(BECountSC, PSE); + if (isa(TripCountSC)) { + LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n"); + return nullptr; + } + + const DataLayout &DL = Header->getModule()->getDataLayout(); + SCEVExpander Expander(*SE, DL, "loop-stripmine"); + if (!AllowExpensiveTripCount && + Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget, + &TTI, PreheaderBR)) { + LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n"); + return nullptr; + } + + // This constraint lets us deal with an overflowing trip count easily; see the + // comment on ModVal below. + if (Log2_32(Count) > BEWidth) { + LLVM_DEBUG( + dbgs() + << "Count failed constraint on overflow trip count calculation.\n"); + return nullptr; + } + + LLVM_DEBUG(dbgs() << "Stripmining loop using grainsize " << Count << "\n"); + using namespace ore; + ORE->emit([&]() { + return OptimizationRemark(LSM_NAME, "Stripmined", + L->getStartLoc(), L->getHeader()) + << "stripmined loop using count " + << NV("StripMineCount", Count); + }); + + // Loop structure is the following: + // + // Preheader + // Header + // ... + // Latch + // LatchExit + + // Insert the epilog remainder. + BasicBlock *NewPreheader; + BasicBlock *NewExit = nullptr; + BasicBlock *EpilogPreheader = nullptr; + { + // Split Preheader to insert a branch around loop for stripmining. + NewPreheader = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI); + NewPreheader->setName(Preheader->getName() + ".new"); + // Split LatchExit to create phi nodes from branch above. + SmallVector Preds(predecessors(LatchExit)); + NewExit = SplitBlockPredecessors(LatchExit, Preds, ".strpm-lcssa", + DT, LI, nullptr, PreserveLCSSA); + // NewExit gets its DebugLoc from LatchExit, which is not part of the + // original Loop. + // Fix this by setting Loop's DebugLoc to NewExit. + auto *NewExitTerminator = NewExit->getTerminator(); + NewExitTerminator->setDebugLoc(Header->getTerminator()->getDebugLoc()); + // Split NewExit to insert epilog remainder loop. + EpilogPreheader = SplitBlock(NewExit, NewExitTerminator, DT, LI); + EpilogPreheader->setName(Header->getName() + ".epil.preheader"); + } + + // Calculate conditions for branch around loop for stripmining + // in epilog case and around prolog remainder loop in prolog case. 
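+  // For instance (hypothetical numbers): with a run-time trip count of 1000
+  // and a stripmine count of 64, the computation below yields 1000 % 64 = 40
+  // extra iterations for the epilog, leaving 960 for the stripmined loop.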
+ // Compute the number of extra iterations required, which is: + // extra iterations = run-time trip count % loop stripmine factor + PreheaderBR = cast(Preheader->getTerminator()); + Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(), + PreheaderBR); + + // Loop structure should be the following: + // Epilog + // + // Preheader + // *NewPreheader + // Header + // ... + // Latch + // *NewExit + // *EpilogPreheader + // LatchExit + + IRBuilder<> B(PreheaderBR); + Value *ModVal; + // Calculate ModVal = (BECount + 1) % Count. + // Note that TripCount is BECount + 1. + if (isPowerOf2_32(Count)) { + // When Count is power of 2 we don't BECount for epilog case. However we'll + // need it for a branch around stripmined loop for prolog case. + ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); + // 1. There are no iterations to be run in the prolog/epilog loop. + // OR + // 2. The addition computing TripCount overflowed. + // + // If (2) is true, we know that TripCount really is (1 << BEWidth) and so + // the number of iterations that remain to be run in the original loop is a + // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we + // explicitly check this above). + if (TL.isInclusiveRange()) + ModVal = B.CreateAdd(ModVal, ConstantInt::get(ModVal->getType(), 1)); + } else { + // As (BECount + 1) can potentially unsigned overflow we count + // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count. + Value *ModValTmp = B.CreateURem(BECount, + ConstantInt::get(BECount->getType(), + Count)); + Value *ModValAdd = B.CreateAdd(ModValTmp, + ConstantInt::get(ModValTmp->getType(), 1)); + // At that point (BECount % Count) + 1 could be equal to Count. + // To handle this case we need to take mod by Count one more time. + ModVal = B.CreateURem(ModValAdd, + ConstantInt::get(BECount->getType(), Count), + "xtraiter"); + } + Value *BranchVal = B.CreateICmpULT( + BECount, ConstantInt::get(BECount->getType(), + TL.isInclusiveRange() ? Count : Count - 1)); + BasicBlock *RemainderLoopBB = NewExit; + BasicBlock *StripminedLoopBB = NewPreheader; + // Branch to either remainder (extra iterations) loop or stripmined loop. + B.CreateCondBr(BranchVal, RemainderLoopBB, StripminedLoopBB); + PreheaderBR->eraseFromParent(); + if (DT) { + // if (UseEpilogRemainder) + DT->changeImmediateDominator(NewExit, Preheader); + // else + // DT->changeImmediateDominator(PrologExit, Preheader); + } + Function *F = Header->getParent(); + // Get an ordered list of blocks in the loop to help with the ordering of the + // cloned blocks in the prolog/epilog code + LoopBlocksDFS LoopBlocks(L); + LoopBlocks.perform(LI); + + // Collect extra blocks in the task that LoopInfo does not consider to be part + // of the loop, e.g., exception-handling code for the task. + SmallVector ExtraTaskBlocks; + SmallVector SharedEHTaskBlocks; + SmallPtrSet SharedEHBlockPreds; + { + SmallPtrSet Visited; + for (Task *SubT : depth_first(T)) { + for (Spindle *S : + depth_first>(SubT->getEntrySpindle())) { + // Only visit shared-eh spindles once a piece. + if (S->isSharedEH() && !Visited.insert(S).second) + continue; + + for (BasicBlock *BB : S->blocks()) { + // Skip blocks in the loop. 
+ if (!L->contains(BB)) { + ExtraTaskBlocks.push_back(BB); + + if (!T->simplyEncloses(BB) && S->isSharedEH()) { + SharedEHTaskBlocks.push_back(BB); + if (S->getEntry() == BB) + for (BasicBlock *Pred : predecessors(BB)) + if (T->simplyEncloses(Pred)) + SharedEHBlockPreds.insert(Pred); + } + } + } + } + } + } + + SmallVector Reattaches; + SmallVector EHBlocksToClone; + SmallPtrSet EHBlockPreds; + SmallPtrSet InlinedLPads; + SmallVector DetachedRethrows; + // Analyze the original task for serialization. + AnalyzeTaskForSerialization(T, Reattaches, EHBlocksToClone, EHBlockPreds, + InlinedLPads, DetachedRethrows); + bool NeedToInsertTaskFrame = taskContainsSync(T); + + // If this detach can throw, get the exceptional continuation of the detach + // and its associated landingpad value. + BasicBlock *EHCont = nullptr; + Value *EHContLPadVal = nullptr; + SmallVector UDPreds; + if (DI->hasUnwindDest()) { + EHCont = T->getEHContinuationSpindle()->getEntry(); + EHContLPadVal = T->getLPadValueInEHContinuationSpindle(); + getEHContPredecessors(EHCont, T, UDPreds, *TI); + } + + // For each extra loop iteration, create a copy of the loop's basic blocks + // and generate a condition that branches to the copy depending on the + // number of 'left over' iterations. + // + std::vector NewBlocks; + ValueToValueMapTy VMap; + + // TODO: For stripmine factor 2 remainder loop will have 1 iterations. + // Do not create 1 iteration loop. + // bool CreateRemainderLoop = (Count != 2); + bool CreateRemainderLoop = true; + + // Clone all the basic blocks in the loop. If Count is 2, we don't clone + // the loop, otherwise we create a cloned loop to execute the extra + // iterations. This function adds the appropriate CFG connections. + BasicBlock *InsertBot = LatchExit; + BasicBlock *InsertTop = EpilogPreheader; + *RemainderLoop = + CloneLoopBlocks(L, ModVal, CreateRemainderLoop, true, UnrollRemainder, + InsertTop, InsertBot, NewPreheader, NewBlocks, LoopBlocks, + ExtraTaskBlocks, SharedEHTaskBlocks, VMap, DT, LI); + + // Insert the cloned blocks into the function. + F->splice(InsertBot->getIterator(), &*F, NewBlocks[0]->getIterator(), + F->end()); + + // Loop structure should be the following: + // Epilog + // + // Preheader + // NewPreheader + // Header + // ... + // Latch + // NewExit + // EpilogPreheader + // EpilogHeader + // ... + // EpilogLatch + // LatchExit + + // Rewrite the cloned instruction operands to use the values created when the + // clone is created. + for (BasicBlock *BB : NewBlocks) + for (Instruction &I : *BB) + RemapInstruction(&I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // Serialize the cloned loop body to render the inner loop serial. + { + // Translate all the analysis for the new cloned task. + SmallVector ClonedReattaches; + for (Instruction *I : Reattaches) + ClonedReattaches.push_back(cast(VMap[I])); + SmallPtrSet ClonedEHBlockPreds; + for (BasicBlock *B : EHBlockPreds) + ClonedEHBlockPreds.insert(cast(VMap[B])); + SmallVector ClonedEHBlocks; + for (BasicBlock *B : EHBlocksToClone) + ClonedEHBlocks.push_back(cast(VMap[B])); + // Landing pads and detached-rethrow instructions may or may not have been + // cloned. 
+ SmallPtrSet ClonedInlinedLPads; + for (LandingPadInst *LPad : InlinedLPads) { + if (VMap[LPad]) + ClonedInlinedLPads.insert(cast(VMap[LPad])); + else + ClonedInlinedLPads.insert(LPad); + } + SmallVector ClonedDetachedRethrows; + for (Instruction *DR : DetachedRethrows) { + if (VMap[DR]) + ClonedDetachedRethrows.push_back(cast(VMap[DR])); + else + ClonedDetachedRethrows.push_back(DR); + } + DetachInst *ClonedDI = cast(VMap[DI]); + // Serialize the new task. + SerializeDetach(ClonedDI, ParentEntry, EHCont, EHContLPadVal, + ClonedReattaches, &ClonedEHBlocks, &ClonedEHBlockPreds, + &ClonedInlinedLPads, &ClonedDetachedRethrows, + NeedToInsertTaskFrame, DT, LI); + } + + // Detach the stripmined loop. + Value *SyncReg = DI->getSyncRegion(), *NewSyncReg; + BasicBlock *EpilogPred, *LoopDetEntry, *LoopReattach; + Module *M = F->getParent(); + if (ParallelEpilog) { + ORE->emit([&]() { + return OptimizationRemark(LSM_NAME, "ParallelEpil", + L->getStartLoc(), L->getHeader()) + << "allowing epilog to execute in parallel with stripmined " + << "loop"; + }); + BasicBlock *LoopDetach = SplitBlock(NewPreheader, + NewPreheader->getTerminator(), DT, LI); + LoopDetach->setName(NewPreheader->getName() + ".strpm.detachloop"); + { + SmallVector HeaderPreds; + for (BasicBlock *Pred : predecessors(Header)) + if (Pred != Latch) + HeaderPreds.push_back(Pred); + LoopDetEntry = + SplitBlockPredecessors(Header, HeaderPreds, ".strpm.detachloop.entry", + DT, LI, nullptr, PreserveLCSSA); + NewSyncReg = CallInst::Create( + Intrinsic::getDeclaration(M, Intrinsic::syncregion_start), {}, + &*LoopDetEntry->getFirstInsertionPt()); + NewSyncReg->setName(SyncReg->getName() + ".strpm.detachloop"); + } + LoopReattach = SplitEdge(Latch, NewExit, DT, LI); + LoopReattach->setName(Header->getName() + ".strpm.detachloop.reattach"); + + // Clone any shared-EH spindles in the stripmined loop to prevent tasks at + // different nesting levels from sharing an EH spindle. + if (!SharedEHTaskBlocks.empty()) + cloneEHBlocks(F, SharedEHTaskBlocks, SharedEHBlockPreds, ".strpm", + nullptr, nullptr, DT, LI); + + // Insert new detach instructions + if (DI->hasUnwindDest()) { + // Insert a detach instruction to detach the stripmined loop. We do this + // early to simplify the operation of nesting the exception-handling code + // in the task. + ReplaceInstWithInst(LoopDetach->getTerminator(), + DetachInst::Create(LoopDetEntry, NewExit, + EHCont, SyncReg)); + // Update the dominator tree to reflect LoopDetach as a new predecessor of + // EHCont. + BasicBlock *OldIDom = DT->getNode(EHCont)->getIDom()->getBlock(); + DT->changeImmediateDominator( + EHCont, DT->findNearestCommonDominator(OldIDom, LoopDetach)); + // Update the PHIs in EHCont with temporary values from LoopDetach. These + // values will be fixed by NestDetachUnwindPredecessors. + for (PHINode &PN : EHCont->phis()) + PN.addIncoming(UndefValue::get(PN.getType()), LoopDetach); + + // Nest the exceptional code in the original task into the new task. + /* BasicBlock *OuterUD = */ NestDetachUnwindPredecessors( + EHCont, EHContLPadVal, UDPreds, LoopDetach, ".strpm", + ".strpm.detachloop.unwind", DI->getUnwindDest()->getLandingPadInst(), + SyncReg, M, DT, LI, nullptr, PreserveLCSSA); + + // Replace sync regions of existing detached-rethrows. + for (Instruction *I : DetachedRethrows) { + InvokeInst *II = cast(I); + II->setArgOperand(0, NewSyncReg); + } + } else { + // Insert a detach instruction to detach the stripmined loop. 
+ ReplaceInstWithInst(LoopDetach->getTerminator(), + DetachInst::Create(LoopDetEntry, NewExit, SyncReg)); + LoopDetach->getTerminator()->setDebugLoc( + Header->getTerminator()->getDebugLoc()); + } + // Insert a reattach instruction after the detached stripmined loop. + ReplaceInstWithInst(LoopReattach->getTerminator(), + ReattachInst::Create(NewExit, SyncReg)); + LoopReattach->getTerminator()->setDebugLoc( + LoopDetach->getTerminator()->getDebugLoc()); + EpilogPred = LoopDetach; + } else { + NewSyncReg = SyncReg; + LoopReattach = NewExit; + LoopDetEntry = NewPreheader; + } + + // Get the set of new loop blocks + SetVector NewLoopBlocks; + { + LoopBlocksDFS NewLoopBlocksDFS(L); + NewLoopBlocksDFS.perform(LI); + LoopBlocksDFS::RPOIterator BlockBegin = NewLoopBlocksDFS.beginRPO(); + LoopBlocksDFS::RPOIterator BlockEnd = NewLoopBlocksDFS.endRPO(); + for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) + NewLoopBlocks.insert(*BB); + } + // Create structure in LI for new loop. + Loop *ParentLoop = L->getParentLoop(); + Loop *NewLoop = LI->AllocateLoop(); + if (ParentLoop) + ParentLoop->replaceChildLoopWith(L, NewLoop); + else + LI->changeTopLevelLoop(L, NewLoop); + NewLoop->addChildLoop(L); + + // Move the detach/reattach instructions to surround the stripmined loop. + BasicBlock *NewHeader; + { + SmallVector HeaderPreds; + for (BasicBlock *Pred : predecessors(Header)) + if (Pred != Latch) + HeaderPreds.push_back(Pred); + NewHeader = + SplitBlockPredecessors(Header, HeaderPreds, ".strpm.outer", + DT, LI, nullptr, PreserveLCSSA); + } + BasicBlock *NewEntry = + SplitBlock(NewHeader, NewHeader->getTerminator(), DT, LI); + NewEntry->setName(TaskEntry->getName() + ".strpm.outer"); + SmallVector LoopReattachPreds{Latch}; + BasicBlock *NewReattB = + SplitBlockPredecessors(LoopReattach, LoopReattachPreds, "", DT, LI, + nullptr, PreserveLCSSA); + NewReattB->setName(Latch->getName() + ".reattach"); + BasicBlock *NewLatch = + SplitBlock(NewReattB, NewReattB->getTerminator(), DT, LI); + NewLatch->setName(Latch->getName() + ".strpm.outer"); + + // Move static allocas from TaskEntry into NewEntry. + MoveStaticAllocasInBlock(NewEntry, TaskEntry, Reattaches); + + // Insert a new detach instruction + BasicBlock *OrigUnwindDest = DI->getUnwindDest(); + if (OrigUnwindDest) { + ReplaceInstWithInst(NewHeader->getTerminator(), + DetachInst::Create(NewEntry, NewLatch, + OrigUnwindDest, NewSyncReg)); + // Update the PHI nodes in the unwind destination of the detach. + for (PHINode &PN : OrigUnwindDest->phis()) + PN.setIncomingBlock(PN.getBasicBlockIndex(Header), NewHeader); + + // Update DT. Walk the path of unique successors from the unwind + // destination to change the immediate dominators of these nodes. Continue + // updating until OrigDUBB equals the exceptional continuation or, as in the + // case of a parallel epilog, we reach a detached-rethrow. + BasicBlock *OrigDUBB = OrigUnwindDest; + BasicBlock *NewDomCandidate = NewHeader; + if (ParallelEpilog && NeedNestedSync) + // We will insert a sync.unwind to OrigUnwindDest, which changes the + // dominator. + NewDomCandidate = + DT->findNearestCommonDominator(NewHeader, LoopReattach); + while (OrigDUBB && (OrigDUBB != EHCont)) { + BasicBlock *OldIDom = + DT->getNode(OrigDUBB)->getIDom()->getBlock(); + DT->changeImmediateDominator( + OrigDUBB, DT->findNearestCommonDominator(OldIDom, NewDomCandidate)); + // Get the next block along the path. If we reach the end of the path at + // a detached-rethrow, then getUniqueSuccessor() returns nullptr. 
+ OrigDUBB = OrigDUBB->getUniqueSuccessor(); + } + // If OrigDUBB equals EHCont, then this is the last block we aim to update. + if (OrigDUBB == EHCont) { + BasicBlock *OldIDom = DT->getNode(EHCont)->getIDom()->getBlock(); + DT->changeImmediateDominator( + EHCont, DT->findNearestCommonDominator(OldIDom, NewDomCandidate)); + } + } else + ReplaceInstWithInst(NewHeader->getTerminator(), + DetachInst::Create(NewEntry, NewLatch, NewSyncReg)); + // Replace the old detach instruction with a branch + ReplaceInstWithInst(Header->getTerminator(), + BranchInst::Create(DI->getDetached())); + + // Replace the old reattach instructions with branches. Along the way, + // determine their common dominator. + BasicBlock *ReattachDom = nullptr; + for (Instruction *I : Reattaches) { + if (!ReattachDom) + ReattachDom = I->getParent(); + else + ReattachDom = DT->findNearestCommonDominator(ReattachDom, I->getParent()); + ReplaceInstWithInst(I, BranchInst::Create(Latch)); + } + assert(ReattachDom && "No reattach-dominator block found"); + // Insert a reattach at the end of NewReattB. + ReplaceInstWithInst(NewReattB->getTerminator(), + ReattachInst::Create(NewLatch, NewSyncReg)); + // Update the dominator tree, and determine predecessors of epilog. + if (DT->dominates(Header, Latch)) + DT->changeImmediateDominator(Latch, ReattachDom); + if (ParallelEpilog) + DT->changeImmediateDominator(LoopReattach, NewLatch); + else + EpilogPred = NewLatch; + + // The block structure of the stripmined loop should now look like so: + // + // LoopDetEntry + // NewHeader (detach NewEntry, NewLatch) + // NewEntry + // Header + // TaskEntry + // ... + // Latch (br Header, NewReattB) + // NewReattB (reattach NewLatch) + // NewLatch (br LoopReattach) + // LoopReattach + + // Add check of stripmined loop count. + IRBuilder<> B2(LoopDetEntry->getTerminator()); + + // We compute the loop count of the outer loop using a UDiv by the power-of-2 + // count to ensure that ScalarEvolution can handle this outer loop once we're + // done. + // + // TODO: Generalize to handle non-power-of-2 counts. + assert(isPowerOf2_32(Count) && "Count is not a power of 2."); + Value *TestVal = B2.CreateUDiv(TripCount, + ConstantInt::get(TripCount->getType(), Count), + "stripiter"); + // Value *TestVal = B2.CreateSub(TripCount, ModVal, "stripiter", true, true); + + // Value *TestCmp = B2.CreateICmpUGT(TestVal, + // ConstantInt::get(TestVal->getType(), 0), + // TestVal->getName() + ".ncmp"); + // ReplaceInstWithInst(NewPreheader->getTerminator(), + // BranchInst::Create(Header, LatchExit, TestCmp)); + // DT->changeImmediateDominator(LatchExit, + // DT->findNearestCommonDominator(LatchExit, + // NewPreheader)); + + // Add new counter for new outer loop. + // + // We introduce a new primary induction variable, NewIdx, into the outer loop, + // which counts up to the outer-loop trip count from 0, stepping by 1. In + // contrast to counting down from the outer-loop trip count, this new variable + // ensures that future loop passes, including LoopSpawning, can process this + // outer loop when we're done. 
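+  // Illustratively (hypothetical IR), the outer loop is left counting
+  //   %niter      = phi i64 [ 0, %LoopDetEntry ], [ %niter.nadd, %NewLatch ]
+  //   %niter.nadd = add i64 %niter, 1
+  //   %niter.ncmp = icmp eq i64 %niter.nadd, %stripiter
+  // where %stripiter is the trip count divided by the stripmine count.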
+ PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter", + NewHeader->getFirstNonPHI()); + B2.SetInsertPoint(NewLatch->getTerminator()); + // Instruction *IdxSub = cast( + // B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + // NewIdx->getName() + ".nsub")); + // IdxSub->copyIRFlags(PrimaryInc); + Instruction *IdxAdd = cast( + B2.CreateAdd(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".nadd")); + IdxAdd->copyIRFlags(PrimaryInc); + + // NewIdx->addIncoming(TestVal, NewPreheader); + // NewIdx->addIncoming(IdxSub, NewLatch); + // Value *IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp"); + NewIdx->addIncoming(ConstantInt::get(TestVal->getType(), 0), LoopDetEntry); + NewIdx->addIncoming(IdxAdd, NewLatch); + Value *IdxCmp = B2.CreateICmpEQ(IdxAdd, TestVal, + NewIdx->getName() + ".ncmp"); + ReplaceInstWithInst(NewLatch->getTerminator(), + BranchInst::Create(LoopReattach, NewHeader, IdxCmp)); + DT->changeImmediateDominator(NewLatch, NewHeader); + // The block structure of the stripmined loop should now look like so: + // + // LoopDetEntry + // NewHeader (detach NewEntry, NewLatch) + // NewEntry + // Header + // TaskEntry + // ... + // Latch (br Header, NewReattB) + // NewReattB (reattach NewLatch) + // NewLatch (br NewHeader, LoopReattach) + // LoopReattach + + // If necessary, add the nested sync right before LoopReattach. + if (ParallelEpilog && NeedNestedSync) { + BasicBlock *NewLoopReattach = + SplitBlock(LoopReattach, LoopReattach->getTerminator(), DT, LI); + BasicBlock *NestedSyncBlock = LoopReattach; + LoopReattach = NewLoopReattach; + NestedSyncBlock->setName(Header->getName() + ".strpm.detachloop.sync"); + ReplaceInstWithInst(NestedSyncBlock->getTerminator(), + SyncInst::Create(LoopReattach, NewSyncReg)); + if (!OrigUnwindDest && F->doesNotThrow()) { + // Insert a call to sync.unwind. + CallInst *SyncUnwind = CallInst::Create( + Intrinsic::getDeclaration(M, Intrinsic::sync_unwind), { NewSyncReg }, + "", LoopReattach->getFirstNonPHIOrDbg()); + // If the Tapir loop has an unwind destination, change the sync.unwind to + // an invoke that unwinds to the cloned unwind destination. + if (OrigUnwindDest) { + BasicBlock *NewBB = + changeToInvokeAndSplitBasicBlock(SyncUnwind, OrigUnwindDest); + + // Update LI. + if (Loop *L = LI->getLoopFor(LoopReattach)) + L->addBasicBlockToLoop(NewBB, *LI); + + // Update DT: LoopReattach dominates Split, which dominates all other + // nodes previously dominated by LoopReattach. + if (DomTreeNode *OldNode = DT->getNode(LoopReattach)) { + std::vector Children(OldNode->begin(), OldNode->end()); + + DomTreeNode *NewNode = DT->addNewBlock(NewBB, LoopReattach); + for (DomTreeNode *I : Children) + DT->changeImmediateDominator(I, NewNode); + } + } + } + } + + // Fixup the LoopInfo for the new loop. 
+ if (!ParentLoop) { + NewLoop->addBasicBlockToLoop(NewHeader, *LI); + NewLoop->addBasicBlockToLoop(NewEntry, *LI); + for (BasicBlock *BB : NewLoopBlocks) { + NewLoop->addBlockEntry(BB); + } + NewLoop->addBasicBlockToLoop(NewReattB, *LI); + NewLoop->addBasicBlockToLoop(NewLatch, *LI); + } else { + LI->changeLoopFor(NewHeader, NewLoop); + NewLoop->addBlockEntry(NewHeader); + LI->changeLoopFor(NewEntry, NewLoop); + NewLoop->addBlockEntry(NewEntry); + for (BasicBlock *BB : NewLoopBlocks) + NewLoop->addBlockEntry(BB); + LI->changeLoopFor(NewReattB, NewLoop); + NewLoop->addBlockEntry(NewReattB); + LI->changeLoopFor(NewLatch, NewLoop); + NewLoop->addBlockEntry(NewLatch); + } + // Update loop metadata + NewLoop->setLoopID(L->getLoopID()); + TapirLoopHints Hints(L); + Hints.clearHintsMetadata(); + + // Update all of the old PHI nodes + B2.SetInsertPoint(NewEntry->getTerminator()); + Instruction *CountVal = cast( + B2.CreateMul(ConstantInt::get(NewIdx->getType(), Count), + NewIdx)); + CountVal->copyIRFlags(PrimaryInduction); + for (auto &InductionEntry : *TL.getInductionVars()) { + PHINode *OrigPhi = InductionEntry.first; + const InductionDescriptor &II = InductionEntry.second; + if (II.getStep()->isZero()) + // Nothing to do for this Phi + continue; + // Get the new step value for this Phi. + Value *PhiCount = !II.getStep()->getType()->isIntegerTy() + ? B2.CreateCast(Instruction::SIToFP, CountVal, + II.getStep()->getType()) + : B2.CreateSExtOrTrunc(CountVal, II.getStep()->getType()); + Value *NewStart = emitTransformedIndex(B2, PhiCount, SE, DL, II); + + // Get the old increment instruction for this Phi + int Idx = OrigPhi->getBasicBlockIndex(NewEntry); + OrigPhi->setIncomingValue(Idx, NewStart); + } + + // Add new induction variable for inner loop. + PHINode *InnerIdx = PHINode::Create(PrimaryInduction->getType(), 2, + "inneriter", + Header->getFirstNonPHI()); + Value *InnerTestVal = ConstantInt::get(PrimaryInduction->getType(), Count); + B2.SetInsertPoint(LatchBR); + Instruction *InnerSub = cast( + B2.CreateSub(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), + InnerIdx->getName() + ".nsub")); + InnerSub->copyIRFlags(PrimaryInc); + // Instruction *InnerAdd = cast( + // B2.CreateAdd(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), + // InnerIdx->getName() + ".nadd")); + // InnerAdd->copyIRFlags(PrimaryInc); + Value *InnerCmp; + if (LatchBR->getSuccessor(0) == Header) + InnerCmp = B2.CreateIsNotNull(InnerSub, InnerIdx->getName() + ".ncmp"); + else + InnerCmp = B2.CreateIsNull(InnerSub, InnerIdx->getName() + ".ncmp"); + InnerIdx->addIncoming(InnerTestVal, NewEntry); + InnerIdx->addIncoming(InnerSub, Latch); + // if (LatchBR->getSuccessor(0) == Header) + // InnerCmp = B2.CreateICmpNE(InnerAdd, InnerTestVal, + // InnerIdx->getName() + ".ncmp"); + // else + // InnerCmp = B2.CreateICmpEQ(InnerAdd, InnerTestVal, + // InnerIdx->getName() + ".ncmp"); + // InnerIdx->addIncoming(ConstantInt::get(InnerIdx->getType(), 0), NewEntry); + // InnerIdx->addIncoming(InnerAdd, Latch); + LatchBR->setCondition(InnerCmp); + + // Connect the epilog code to the original loop and update the PHI functions. + B2.SetInsertPoint(EpilogPreheader->getTerminator()); + + // Compute the start of the epilog iterations. We use a divide and multiply + // by the power-of-2 count to simplify the SCEV's of the induction variables + // for later analysis passes. 
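+  // For example (hypothetical numbers), with TripCount = 1000 and Count = 64,
+  // EpilStartIter = (1000 / 64) * 64 = 960, the index of the first iteration
+  // handled by the epilog.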
+ // Value *EpilStartIter = B2.CreateSub(TripCount, ModVal); + Value *EpilStartIter = + B2.CreateMul(B2.CreateUDiv(TripCount, + ConstantInt::get(TripCount->getType(), Count)), + ConstantInt::get(TripCount->getType(), Count)); + if (Instruction *ESIInst = dyn_cast(EpilStartIter)) + ESIInst->copyIRFlags(PrimaryInc); + ConnectEpilog(TL, EpilStartIter, ModVal, EpilogPred, LoopReattach, NewExit, + LatchExit, Preheader, EpilogPreheader, VMap, DT, LI, SE, DL, + PreserveLCSSA); + + // If this loop is nested, then the loop stripminer changes the code in the + // any of its parent loops, so the Scalar Evolution pass needs to be run + // again. + SE->forgetTopmostLoop(L); + + // FIXME: Optionally unroll remainder loop + // + // if (RemainderLoop && UnrollRemainder) { + // LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n"); + // UnrollLoop(RemainderLoop, /*Count*/ Count - 1, /*TripCount*/ Count - 1, + // /*Force*/ false, /*AllowRuntime*/ false, + // /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true, + // /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1, + // /*PeelCount*/ 0, /*UnrollRemainder*/ false, LI, SE, DT, AC, + // /*TI*/ nullptr, /*ORE*/ nullptr, /*PreserveLCSSA*/ true); + // } + + // Record that the remainder loop was derived from a Tapir loop. + (*RemainderLoop)->setDerivedFromTapirLoop(); + + // At this point, the code is well formed. We now simplify the new loops, + // doing constant propagation and dead code elimination as we go. + simplifyLoopAfterStripMine(L, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC); + simplifyLoopAfterStripMine(NewLoop, /*SimplifyIVs*/ true, LI, SE, DT, TTI, + AC); + simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT, + TTI, AC); + +#ifndef NDEBUG + DT->verify(); + LI->verify(*DT); +#endif + + // Record that the old loop was derived from a Tapir loop. + L->setDerivedFromTapirLoop(); + + // Update TaskInfo manually using the updated DT. + if (TI) + // FIXME: Recalculating TaskInfo for the whole function is wasteful. + // Optimize this routine in the future. + TI->recalculate(*F, *DT); + + return NewLoop; +} diff --git a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp new file mode 100644 index 000000000000000..5a4921115cd91e0 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp @@ -0,0 +1,453 @@ +//===- LoopStripMinePass.cpp - Loop strip-mining pass ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to perform Tapir loop strip-mining. 
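+// Strip-mining groups the iterations of a Tapir loop into chunks of a
+// computed grainsize: chunks are spawned in parallel, while the iterations
+// within each chunk execute serially.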
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/LoopStripMinePass.h" +#include "llvm/ADT/PriorityWorklist.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/WorkSpanAnalysis.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/InstructionCost.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Tapir.h" +#include "llvm/Transforms/Tapir/LoopStripMine.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "loop-stripmine" + +cl::opt llvm::EnableTapirLoopStripmine( + "stripmine-loops", cl::init(true), cl::Hidden, + cl::desc("Run the Tapir Loop stripmining pass")); + +static cl::opt AllowParallelEpilog( + "allow-parallel-epilog", cl::Hidden, cl::init(true), + cl::desc("Allow stripmined Tapir loops to execute their epilogs in parallel.")); + +static cl::opt IncludeNestedSync( + "include-nested-sync", cl::Hidden, cl::init(true), + cl::desc("If the epilog is allowed to execute in parallel, include a sync " + "instruction in the nested task.")); + +static cl::opt RequireParallelEpilog( + "require-parallel-epilog", cl::Hidden, cl::init(false), + cl::desc("Require stripmined Tapir loops to execute their epilogs in " + "parallel. Intended for debugging.")); + +/// Create an analysis remark that explains why stripmining failed +/// +/// \p RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents vectorization. Otherwise \p TheLoop is used for +/// the location of the remark. \return the remark object that can be streamed +/// to. +static OptimizationRemarkAnalysis +createMissedAnalysis(StringRef RemarkName, const Loop *TheLoop, + Instruction *I = nullptr) { + const Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert back to + // using the loop's. + if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(DEBUG_TYPE, RemarkName, DL, CodeRegion); + R << "loop not stripmined: "; + return R; +} + + +/// Approximate the work of the body of the loop L. Returns several relevant +/// properties of loop L via by-reference arguments. +static InstructionCost ApproximateLoopCost( + const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, + bool &Convergent, bool &IsRecursive, bool &UnknownSize, + const TargetTransformInfo &TTI, LoopInfo *LI, ScalarEvolution &SE, + const SmallPtrSetImpl &EphValues, + TargetLibraryInfo *TLI) { + + WSCost LoopCost; + estimateLoopCost(LoopCost, L, LI, &SE, TTI, TLI, EphValues); + + // Exclude calls to builtins when counting the calls. This assumes that all + // builtin functions are cheap. 
+ NumCalls = LoopCost.Metrics.NumCalls - LoopCost.Metrics.NumBuiltinCalls; + NotDuplicatable = LoopCost.Metrics.notDuplicatable; + Convergent = LoopCost.Metrics.convergent; + IsRecursive = LoopCost.Metrics.isRecursive; + UnknownSize = LoopCost.UnknownCost; + + return LoopCost.Work; +} + +static bool tryToStripMineLoop( + Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + const TargetTransformInfo &TTI, AssumptionCache &AC, TaskInfo *TI, + OptimizationRemarkEmitter &ORE, TargetLibraryInfo *TLI, bool PreserveLCSSA, + std::optional ProvidedCount) { + Task *T = getTaskIfTapirLoopStructure(L, TI); + if (!T) + return false; + TapirLoopHints Hints(L); + + if (TM_Disable == hasLoopStripmineTransformation(L)) + return false; + + LLVM_DEBUG(dbgs() << "Loop Strip Mine: F[" + << L->getHeader()->getParent()->getName() << "] Loop %" + << L->getHeader()->getName() << "\n"); + + if (!L->isLoopSimplifyForm()) { + LLVM_DEBUG( + dbgs() << " Not stripmining loop which is not in loop-simplify " + "form.\n"); + return false; + } + bool StripMiningRequested = + (hasLoopStripmineTransformation(L) == TM_ForcedByUser); + TargetTransformInfo::StripMiningPreferences SMP = + gatherStripMiningPreferences(L, SE, TTI, ProvidedCount); + + unsigned NumCalls = 0; + bool NotDuplicatable = false; + bool Convergent = false; + bool IsRecursive = false; + bool UnknownSize = false; + + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(L, &AC, EphValues); + + InstructionCost LoopCost = + ApproximateLoopCost(L, NumCalls, NotDuplicatable, Convergent, IsRecursive, + UnknownSize, TTI, LI, SE, EphValues, TLI); + // Determine the iteration count of the eventual stripmined the loop. + bool explicitCount = computeStripMineCount(L, TTI, LoopCost, SMP); + + // If the loop size is unknown, then we cannot compute a stripmining count for + // it. + if (!explicitCount && UnknownSize) { + LLVM_DEBUG(dbgs() << " Not stripmining loop with unknown size.\n"); + ORE.emit(createMissedAnalysis("UnknownSize", L) + << "Cannot stripmine loop with unknown size."); + return false; + } + + // If the loop size is enormous, then we might want to use a stripmining count + // of 1 for it. + LLVM_DEBUG(dbgs() << " Loop Cost = " << LoopCost << "\n"); + if (!explicitCount && InstructionCost::getMax() == LoopCost) { + LLVM_DEBUG(dbgs() << " Not stripmining loop with very large size.\n"); + if (Hints.getGrainsize() == 1) + return false; + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "HugeLoop", + L->getStartLoc(), L->getHeader()) + << "using grainsize 1 for huge loop"; + }); + Hints.setAlreadyStripMined(); + return true; + } + + // If the loop is recursive, set the stripmine factor to be 1. + if (!explicitCount && IsRecursive) { + LLVM_DEBUG(dbgs() << " Not stripmining loop that recursively calls the " + << "containing function.\n"); + if (Hints.getGrainsize() == 1) + return false; + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "RecursiveCalls", + L->getStartLoc(), L->getHeader()) + << "using grainsize 1 for loop with recursive calls"; + }); + Hints.setAlreadyStripMined(); + return true; + } + + // TODO: We can stripmine loops if the stripmined version does not require a + // prolog or epilog. 
+ if (NotDuplicatable) { + LLVM_DEBUG(dbgs() << " Not stripmining loop which contains " + << "non-duplicatable instructions.\n"); + ORE.emit(createMissedAnalysis("NotDuplicatable", L) + << "Cannot stripmine loop with non-duplicatable instructions."); + return false; + } + + // If the loop contains a convergent operation, then the control flow + // introduced between the stripmined loop and epilog is unsafe -- it adds a + // control-flow dependency to the convergent operation. + if (Convergent) { + LLVM_DEBUG(dbgs() << " Skipping loop with convergent operations.\n"); + ORE.emit(createMissedAnalysis("Convergent", L) + << "Cannot stripmine loop with convergent instructions."); + return false; + } + + // If the loop contains potentially expensive function calls, then we don't + // want to stripmine it. + if (NumCalls > 0 && !explicitCount && !StripMiningRequested) { + LLVM_DEBUG(dbgs() << " Skipping loop with expensive function calls.\n"); + ORE.emit(createMissedAnalysis("ExpensiveCalls", L) + << "Not stripmining loop with potentially expensive calls."); + return false; + } + + // Make sure the count is a power of 2. + if (!isPowerOf2_32(SMP.Count)) + SMP.Count = NextPowerOf2(SMP.Count); + if (SMP.Count < 2) { + if (Hints.getGrainsize() == 1) + return false; + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "LargeLoop", + L->getStartLoc(), L->getHeader()) + << "using grainsize 1 for large loop"; + }); + Hints.setAlreadyStripMined(); + return true; + } + + // Find a constant trip count if available + unsigned ConstTripCount = getConstTripCount(L, SE); + + // Stripmining factor (Count) must be less or equal to TripCount. + if (ConstTripCount && SMP.Count >= ConstTripCount) { + ORE.emit(createMissedAnalysis("FullStripMine", L) + << "Stripmining count larger than loop trip count."); + ORE.emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "UnprofitableParallelLoop", + L->getStartLoc(), L->getHeader()) + << "Parallel loop does not appear profitable to parallelize."); + return false; + } + + // When is it worthwhile to allow the epilog to run in parallel with the + // stripmined loop? We expect the epilog to perform G/2 iterations on + // average, where G is the selected grainsize. Our goal is to ensure that + // these G/2 iterations offset the cost of an additional detach. + // Mathematically, this means + // + // (G/2) * S + d <= (1 + \eps) * G/2 * S , + // + // where S is the work of one loop iteration, d is the cost of a detach, and + // \eps is a sufficiently small constant, e.g., 1/C for a coarsening factor C. + // We assume that the choice of G is chosen such that G * \eps <= 1, which is + // true for the automatic computation of G aimed at ensuring the stripmined + // loop performs at most a (1 + \eps) factor more work than its serial + // projection. Solving the above equation thus shows that the epilog should + // be allowed to run in parallel when S >= 2 * d. We check for this case and + // encode the result in ParallelEpilog. + Instruction *DetachI = L->getHeader()->getTerminator(); + bool ParallelEpilog = + RequireParallelEpilog || + (AllowParallelEpilog && + ((SMP.Count < SMP.DefaultCoarseningFactor) || + (2 * TTI.getInstructionCost(DetachI, + TargetTransformInfo::TCK_SizeAndLatency)) <= + LoopCost)); + + // Some parallel runtimes, such as Cilk, require nested parallel tasks to be + // synchronized. 
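+  // When that is the case, StripMineLoop ends the task containing the
+  // stripmined loop with an explicit sync on the nested sync region before it
+  // reattaches (the ".strpm.detachloop.sync" block).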
+  bool NeedNestedSync = IncludeNestedSync;
+  if (!NeedNestedSync && TLI)
+    NeedNestedSync = (TLI->getTapirTarget() == TapirTargetID::OpenCilk);
+
+  // Save loop properties before it is transformed.
+  MDNode *OrigLoopID = L->getLoopID();
+
+  // Stripmine the loop
+  Loop *RemainderLoop = nullptr;
+  Loop *NewLoop = StripMineLoop(L, SMP.Count, SMP.AllowExpensiveTripCount,
+                                SMP.UnrollRemainder, LI, &SE, &DT, TTI, &AC, TI,
+                                &ORE, PreserveLCSSA, ParallelEpilog,
+                                NeedNestedSync, &RemainderLoop);
+  if (!NewLoop)
+    return false;
+
+  // Copy metadata to remainder loop
+  if (RemainderLoop && OrigLoopID) {
+    // std::optional<MDNode *> RemainderLoopID = makeFollowupLoopID(
+    //     OrigLoopID, {}, "tapir.loop");
+    MDNode *NewRemainderLoopID =
+        CopyNonTapirLoopMetadata(RemainderLoop->getLoopID(), OrigLoopID);
+    RemainderLoop->setLoopID(NewRemainderLoopID);
+  }
+
+  // Mark the new loop as stripmined.
+  TapirLoopHints NewHints(NewLoop);
+  NewHints.setAlreadyStripMined();
+
+  return true;
+}
+
+namespace {
+
+class LoopStripMine : public LoopPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+
+  std::optional<unsigned> ProvidedCount;
+
+  LoopStripMine(std::optional<unsigned> Count = std::nullopt)
+      : LoopPass(ID), ProvidedCount(Count) {
+    initializeLoopStripMinePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+    if (skipLoop(L))
+      return false;
+
+    Function &F = *L->getHeader()->getParent();
+
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    TaskInfo *TI = &getAnalysis<TaskInfoWrapperPass>().getTaskInfo();
+    ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    const TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+    // pass. Function analyses need to be preserved across loop transformations
+    // but ORE cannot be preserved (see comment before the pass definition).
+    OptimizationRemarkEmitter ORE(&F);
+    bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+    return tryToStripMineLoop(L, DT, LI, SE, TTI, AC, TI, ORE, &TLI,
+                              PreserveLCSSA, ProvidedCount);
+  }
+
+  /// This transformation requires natural loop information & requires that
+  /// loop preheaders be inserted into the CFG...
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    getLoopAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char LoopStripMine::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopStripMine, "loop-stripmine", "Stripmine Tapir loops",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopStripMine, "loop-stripmine", "Stripmine Tapir loops",
+                    false, false)
+
+Pass *llvm::createLoopStripMinePass(int Count) {
+  // TODO: It would make more sense for this function to take the optionals
+  // directly, but that's dangerous since it would silently break out of tree
+  // callers.
+  return new LoopStripMine(Count == -1 ? std::nullopt
+                                       : std::optional<unsigned>(Count));
+}
+
+PreservedAnalyses LoopStripMinePass::run(Function &F,
+                                         FunctionAnalysisManager &AM) {
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto &TI = AM.getResult<TaskAnalysis>(F);
+  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+  LoopAnalysisManager *LAM = nullptr;
+  if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
+    LAM = &LAMProxy->getManager();
+
+  // const ModuleAnalysisManager &MAM =
+  //     AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+  // ProfileSummaryInfo *PSI =
+  //     MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+
+  bool Changed = false;
+
+  // The stripminer requires loops to be in simplified form, and also needs
+  // LCSSA. Since simplification may add new inner loops, it has to run before
+  // the legality and profitability checks. This means running the loop
+  // stripminer will simplify all loops, regardless of whether anything ends up
+  // being stripmined.
+  for (auto &L : LI) {
+    Changed |= simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr,
+                            /* PreserveLCSSA */ false);
+    Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+  }
+
+  SmallPriorityWorklist<Loop *, 4> Worklist;
+  appendLoopsToWorklist(LI, Worklist);
+
+  while (!Worklist.empty()) {
+    // Because the LoopInfo stores the loops in RPO, we walk the worklist from
+    // back to front so that we work forward across the CFG, which for
+    // stripmining is only needed to get optimization remarks emitted in a
+    // forward order.
+    Loop &L = *Worklist.pop_back_val();
+#ifndef NDEBUG
+    Loop *ParentL = L.getParentLoop();
+#endif
+
+    // // Check if the profile summary indicates that the profiled application
+    // // has a huge working set size, in which case we disable peeling to avoid
+    // // bloating it further.
+    // if (PSI && PSI->hasHugeWorkingSetSize())
+    //   AllowPeeling = false;
+    std::string LoopName = std::string(L.getName());
+    bool LoopChanged =
+        tryToStripMineLoop(&L, DT, &LI, SE, TTI, AC, &TI, ORE, &TLI,
+                           /*PreserveLCSSA*/ true, /*Count*/ std::nullopt);
+    Changed |= LoopChanged;
+
+    // The parent must not be damaged by stripmining!
+#ifndef NDEBUG
+    if (LoopChanged && ParentL)
+      ParentL->verifyLoop();
+#endif
+
+    // Clear any cached analysis results for L if we removed it completely.
+    if (LAM && LoopChanged)
+      LAM->clear(L, LoopName);
+  }
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  return getLoopPassPreservedAnalyses();
+}
diff --git a/llvm/lib/Transforms/Tapir/LoweringUtils.cpp b/llvm/lib/Transforms/Tapir/LoweringUtils.cpp
new file mode 100644
index 000000000000000..32131739275eec1
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/LoweringUtils.cpp
@@ -0,0 +1,1269 @@
+//===- LoweringUtils.cpp - Utility functions for lowering Tapir -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements several utility functions for lowering Tapir.
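+// These include routines to compute the inputs and outputs of Tapir tasks and
+// taskframes, to outline tasks and taskframes into helper functions, and to
+// replace spawned tasks and Tapir loops with calls or invokes of those
+// helpers.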
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Timer.h" +#include "llvm/Transforms/IPO/FunctionAttrs.h" +#include "llvm/Transforms/Tapir/LambdaABI.h" +#include "llvm/Transforms/Tapir/OMPTaskABI.h" +#include "llvm/Transforms/Tapir/OpenCilkABI.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Tapir/QthreadsABI.h" +#include "llvm/Transforms/Tapir/SerialABI.h" +#include "llvm/Transforms/Tapir/TapirLoopInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "tapirlowering" + +static const char TimerGroupName[] = DEBUG_TYPE; +static const char TimerGroupDescription[] = "Tapir lowering"; + +TapirTarget *llvm::getTapirTargetFromID(Module &M, TapirTargetID ID) { + switch (ID) { + case TapirTargetID::None: + return nullptr; + case TapirTargetID::Serial: + return new SerialABI(M); + case TapirTargetID::Cheetah: + case TapirTargetID::OpenCilk: + return new OpenCilkABI(M); + case TapirTargetID::Lambda: + return new LambdaABI(M); + case TapirTargetID::OMPTask: + return new OMPTaskABI(M); + case TapirTargetID::Qthreads: + return new QthreadsABI(M); + default: + llvm_unreachable("Invalid TapirTargetID"); + } +} + +//----------------------------------------------------------------------------// +// Lowering utilities for Tapir tasks. + +/// Helper function to find the inputs and outputs to task T, based only the +/// blocks in T and no subtask of T. +static void +findTaskInputsOutputs(const Task *T, ValueSet &Inputs, ValueSet &Outputs, + const DominatorTree &DT) { + NamedRegionTimer NRT("findTaskInputsOutputs", "Find task inputs and outputs", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + // Get the sync region for this task's detach, so we can filter it out of this + // task's inputs. + const Value *SyncRegion = nullptr; + SmallPtrSet UnwindPHIs; + if (DetachInst *DI = T->getDetach()) { + SyncRegion = DI->getSyncRegion(); + // Ignore PHIs in the unwind destination of the detach. + if (DI->hasUnwindDest()) + UnwindPHIs.insert(DI->getUnwindDest()); + // Get the PHI nodes that directly or indirectly use the landing pad of the + // unwind destination of this task's detach. + getDetachUnwindPHIUses(DI, UnwindPHIs); + } + + for (Spindle *S : depth_first>(T->getEntrySpindle())) { + LLVM_DEBUG(dbgs() << "Examining spindle for inputs/outputs: " << *S + << "\n"); + for (BasicBlock *BB : S->blocks()) { + // Skip basic blocks that are successors of detached rethrows. They're + // dead anyway. + if (isSuccessorOfDetachedRethrow(BB) || isPlaceholderSuccessor(BB)) + continue; + + // If a used value is defined outside the region, it's an input. If an + // instruction is used outside the region, it's an output. + for (Instruction &II : *BB) { + // Examine all operands of this instruction. + for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; + ++OI) { + + // If the operand of I is defined in the same basic block as I, then + // it's not an input. 
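+          // (This is only an early exit; an operand ultimately counts as an
+          // input only if it is defined in T's parent, per the
+          // definedInParent() check below.)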
+ if (Instruction *OP = dyn_cast(*OI)) + if (OP->getParent() == BB) + continue; + + // PHI nodes in the entry block of a shared-EH exit will be + // rewritten in any cloned helper, so we skip operands of these PHI + // nodes for blocks not in this task. + if (S->isSharedEH() && S->isEntry(BB)) + if (PHINode *PN = dyn_cast(&II)) { + LLVM_DEBUG(dbgs() + << "\tPHI node in shared-EH spindle: " << *PN << "\n"); + if (!T->simplyEncloses(PN->getIncomingBlock(*OI))) { + LLVM_DEBUG(dbgs() << "skipping\n"); + continue; + } + } + // If the operand is the sync region of this task's detach, skip it. + if (SyncRegion == *OI) + continue; + // If this operand is defined in the parent, it's an input. + if (T->definedInParent(*OI)) + Inputs.insert(*OI); + } + // Examine all uses of this instruction + for (User *U : II.users()) { + // If we find a live use outside of the task, it's an output. + if (Instruction *I = dyn_cast(U)) { + // Skip uses in PHI nodes that depend on the unwind landing pad of + // the detach. + if (UnwindPHIs.count(I->getParent())) + continue; + if (!T->encloses(I->getParent()) && + DT.isReachableFromEntry(I->getParent())) + Outputs.insert(&II); + } + } + } + } + } +} + +/// Determine the inputs for all tasks in this function. Returns a map from +/// tasks to their inputs. +/// +/// Aggregating all of this work into a single routine allows us to avoid +/// redundant traversals of basic blocks in nested tasks. +TaskValueSetMap llvm::findAllTaskInputs(Function &F, const DominatorTree &DT, + const TaskInfo &TI) { + TaskValueSetMap TaskInputs; + for (Task *T : post_order(TI.getRootTask())) { + // Skip the root task + if (T->isRootTask()) break; + + LLVM_DEBUG(dbgs() << "Finding inputs/outputs for task@" + << T->getEntry()->getName() << "\n"); + ValueSet Inputs, Outputs; + // Check all inputs of subtasks to determine if they're inputs to this task. + for (Task *SubT : T->subtasks()) { + LLVM_DEBUG(dbgs() << "\tsubtask @ " << SubT->getEntry()->getName() + << "\n"); + + if (TaskInputs.count(SubT)) + for (Value *V : TaskInputs[SubT]) + if (T->definedInParent(V)) + Inputs.insert(V); + } + + LLVM_DEBUG({ + dbgs() << "Subtask Inputs:\n"; + for (Value *V : Inputs) + dbgs() << "\t" << *V << "\n"; + dbgs() << "Subtask Outputs:\n"; + for (Value *V : Outputs) + dbgs() << "\t" << *V << "\n"; + }); + assert(Outputs.empty() && "Task should have no outputs."); + + // Find additional inputs and outputs of task T by examining blocks in T and + // not in any subtask of T. + findTaskInputsOutputs(T, Inputs, Outputs, DT); + + LLVM_DEBUG({ + dbgs() << "Inputs:\n"; + for (Value *V : Inputs) + dbgs() << "\t" << *V << "\n"; + dbgs() << "Outputs:\n"; + for (Value *V : Outputs) + dbgs() << "\t" << *V << "\n"; + }); + assert(Outputs.empty() && "Task should have no outputs."); + + // Map the computed inputs to this task. + TaskInputs[T] = Inputs; + } + return TaskInputs; +} + +// Helper function to check if a value is defined outside of a given spindle. +static bool definedOutsideTaskFrame(const Value *V, const Spindle *TF, + const TaskInfo &TI) { + // Arguments are always defined outside of spindles. + if (isa(V)) + return true; + + // If V is an instruction, check if TFSpindles contains it. + if (const Instruction *I = dyn_cast(V)) + return !taskFrameContains(TF, I->getParent(), TI); + + return false; +} + +/// Get the set of inputs for the given task T, accounting for the taskframe of +/// T, if it exists. 
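+///
+/// Values flowing into the taskframe are accumulated into \p TFInputs, and
+/// values defined in the taskframe but used outside of it are accumulated
+/// into \p TFOutputs.  \p TaskInputs, if non-null, provides the previously
+/// computed inputs of the task associated with this taskframe.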
+void llvm::getTaskFrameInputsOutputs(TFValueSetMap &TFInputs, + TFValueSetMap &TFOutputs, + const Spindle &TF, + const ValueSet *TaskInputs, + const TaskInfo &TI, + const DominatorTree &DT) { + NamedRegionTimer NRT("getTaskFrameInputsOutputs", + "Find taskframe inputs and outputs", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + + const Task *T = TF.getTaskFromTaskFrame(); + if (T) + LLVM_DEBUG(dbgs() << "getTaskFrameInputsOutputs: task@" + << T->getEntry()->getName() << "\n"); + else + LLVM_DEBUG(dbgs() << "getTaskFrameInputsOutputs: taskframe spindle@" + << TF.getEntry()->getName() << "\n"); + + // Check the taskframe spindles for definitions of inputs to T. + if (TaskInputs) + for (Value *V : *TaskInputs) + if (definedOutsideTaskFrame(V, &TF, TI)) + TFInputs[&TF].insert(V); + + // Get inputs from child taskframes. + for (Spindle *SubTF : TF.subtaskframes()) + for (Value *V : TFInputs[SubTF]) + if (definedOutsideTaskFrame(V, &TF, TI)) + TFInputs[&TF].insert(V); + + Value *TFCreate = T ? T->getTaskFrameUsed() : TF.getTaskFrameCreate(); + // Get inputs and outputs of the taskframe. + for (Spindle *S : TF.taskframe_spindles()) { + // Skip taskframe spindles within the task itself. + if (T && T->contains(S)) + continue; + + // Skip spindles that are placeholders. + if (isPlaceholderSuccessor(S->getEntry())) + continue; + + for (BasicBlock *BB : S->blocks()) { + for (Instruction &I : *BB) { + // Ignore certain instructions from consideration: the taskframe.create + // intrinsic for this taskframe, the detach instruction that spawns T, + // and the landingpad value in T's EH continuation. + if ((TFCreate == &I) || isa(&I) || + (T && T->getLPadValueInEHContinuationSpindle() == &I)) + continue; + + // Examine all operands of this instruction + for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; + ++OI) { + + // If the operand of I is defined in the same basic block as I, then + // it's not an input. + if (Instruction *OP = dyn_cast(*OI)) + if (OP->getParent() == BB) + continue; + + // Some canonicalization methods, e.g., loop canonicalization, will + // introduce a basic block after a detached-rethrow that branches to + // the successor of the EHContinuation entry. As a result, we can get + // PHI nodes that use the landingpad of a detached-rethrow. These + // PHI-node inputs will be rewritten anyway, so skip them. + if (isa(I)) + if (Instruction *OP = dyn_cast(*OI)) { + if (isa(*OP) && T && T->encloses(OP->getParent())) + if (isSuccessorOfDetachedRethrow(OP->getParent())) + continue; + // Also ignore PHI nodes in shared-eh spindles. + if (T && S->isSharedEH()) + continue; + } + + // Skip detached-rethrow calls in shared-eh spindles. + if (T && S->isSharedEH()) + if (isDetachedRethrow(&I)) + continue; + + // TODO: Add a test to exclude landingpads from detached-rethrows? + LLVM_DEBUG({ + if (Instruction *OP = dyn_cast(*OI)) { + assert(!(T && T->encloses(OP->getParent())) && + "TaskFrame uses value defined in task."); + } + }); + // If this operand is not defined outside of the taskframe, then it's + // an input. + if (definedOutsideTaskFrame(*OI, &TF, TI)) + TFInputs[&TF].insert(*OI); + } + // Examine all users of this instruction. + for (User *U : I.users()) { + // If we find a live use outside of the task, it's an output. 
+ if (Instruction *UI = dyn_cast(U)) { + if (definedOutsideTaskFrame(UI, &TF, TI) && + DT.isReachableFromEntry(UI->getParent())) + TFOutputs[&TF].insert(&I); + } + } + } + } + } +} + +/// Determine the inputs for all taskframes in this function. Returns a map +/// from tasks to their inputs. +/// +/// Aggregating all of this work into a single routine allows us to avoid +/// redundant traversals of basic blocks in nested tasks. +void llvm::findAllTaskFrameInputs( + TFValueSetMap &TFInputs, TFValueSetMap &TFOutputs, + const SmallVectorImpl &AllTaskFrames, Function &F, + const DominatorTree &DT, TaskInfo &TI) { + // Determine the inputs for all tasks. + TaskValueSetMap TaskInputs = findAllTaskInputs(F, DT, TI); + + for (Spindle *TF : AllTaskFrames) { + Task *T = TF->getTaskFromTaskFrame(); + + // Update the inputs to account for the taskframe. + getTaskFrameInputsOutputs(TFInputs, TFOutputs, *TF, + T ? &TaskInputs[T] : nullptr, TI, DT); + + LLVM_DEBUG({ + dbgs() << "TFInputs:\n"; + for (Value *V : TFInputs[TF]) + dbgs() << "\t" << *V << "\n"; + dbgs() << "TFOutputs:\n"; + for (Value *V : TFOutputs[TF]) + dbgs() << "\t" << *V << "\n"; + }); + } +} + +/// Create a structure for storing all arguments to a task. +/// +/// NOTE: This function inserts the struct for task arguments in the same +/// location as the Reference compiler and other compilers that lower parallel +/// constructs in the front end. This location is NOT the correct place, +/// however, for handling tasks that are spawned inside of a serial loop. +std::pair +llvm::createTaskArgsStruct(const ValueSet &Inputs, Task *T, + Instruction *StorePt, Instruction *LoadPt, + bool staticStruct, ValueToValueMapTy &InputsMap, + Loop *TapirL) { + assert(T && T->getParentTask() && "Expected spawned task."); + SmallPtrSet TaskFrameBlocks; + if (Spindle *TFCreateSpindle = T->getTaskFrameCreateSpindle()) { + // Collect taskframe blocks + for (Spindle *S : TFCreateSpindle->taskframe_spindles()) { + // Skip spindles contained in the task. + if (T->contains(S)) + continue; + // Skip placeholder spindles. + if (isPlaceholderSuccessor(S->getEntry())) + continue; + + for (BasicBlock *B : S->blocks()) + TaskFrameBlocks.insert(B); + } + } + assert((T->encloses(LoadPt->getParent()) || + TaskFrameBlocks.contains(LoadPt->getParent()) || + (TapirL && LoadPt->getParent() == TapirL->getHeader())) && + "Loads of struct arguments must be inside task."); + assert(!T->encloses(StorePt->getParent()) && + !TaskFrameBlocks.contains(StorePt->getParent()) && + "Store of struct arguments must be outside task."); + assert(T->getParentTask()->encloses(StorePt->getParent()) && + "Store of struct arguments expected to be in parent task."); + SmallVector InputsToSort; + { + for (Value *V : Inputs) + InputsToSort.push_back(V); + Function *F = T->getEntry()->getParent(); + const DataLayout &DL = F->getParent()->getDataLayout(); + std::sort(InputsToSort.begin(), InputsToSort.end(), + [&DL](const Value *A, const Value *B) { + return DL.getTypeSizeInBits(A->getType()) > + DL.getTypeSizeInBits(B->getType()); + }); + } + + // Get vector of struct inputs and their types. + SmallVector StructInputs; + SmallVector StructIT; + for (Value *V : InputsToSort) { + StructInputs.push_back(V); + StructIT.push_back(V->getType()); + } + + // Create an alloca for this struct in the parent task's entry block. + Instruction *ArgsStart = StorePt; + IRBuilder<> B(StorePt); + // TODO: Add lifetime intrinsics for this allocation. 
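+  // Depending on staticStruct, the closure is either allocated statically in
+  // the entry of the parent taskframe (or parent task) or allocated
+  // dynamically immediately before the detach.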
+ AllocaInst *Closure; + StructType *ST = StructType::get(T->getEntry()->getContext(), StructIT); + LLVM_DEBUG(dbgs() << "Closure struct type " << *ST << "\n"); + if (staticStruct) { + Spindle *ParentTF = T->getEntrySpindle()->getTaskFrameParent(); + BasicBlock *AllocaInsertBlk = + ParentTF ? ParentTF->getEntry() : T->getParentTask()->getEntry(); + Value *TFCreate = ParentTF ? ParentTF->getTaskFrameCreate() : nullptr; + IRBuilder<> Builder(TFCreate + ? &*++cast(TFCreate)->getIterator() + : &*AllocaInsertBlk->getFirstInsertionPt()); + Closure = Builder.CreateAlloca(ST); + // Store arguments into the structure + if (!StructInputs.empty()) + ArgsStart = B.CreateStore(StructInputs[0], + B.CreateConstGEP2_32(ST, Closure, 0, 0)); + for (unsigned i = 1; i < StructInputs.size(); ++i) + B.CreateStore(StructInputs[i], B.CreateConstGEP2_32(ST, Closure, 0, i)); + } else { + // Add code to store values into struct immediately before detach. + Closure = B.CreateAlloca(ST); + ArgsStart = Closure; + // Store arguments into the structure + for (unsigned i = 0; i < StructInputs.size(); ++i) + B.CreateStore(StructInputs[i], B.CreateConstGEP2_32(ST, Closure, 0, i)); + } + + // Add code to load values from struct in task entry and use those loaded + // values. + IRBuilder<> B2(LoadPt); + for (unsigned i = 0; i < StructInputs.size(); ++i) { + auto STGEP = cast(B2.CreateConstGEP2_32(ST, Closure, 0, i)); + auto STLoad = B2.CreateLoad(StructIT[i], STGEP); + InputsMap[StructInputs[i]] = STLoad; + + // Update all uses of the struct inputs in the loop body. + auto UI = StructInputs[i]->use_begin(), E = StructInputs[i]->use_end(); + for (; UI != E;) { + Use &U = *UI; + ++UI; + auto *Usr = dyn_cast(U.getUser()); + if (!Usr) + continue; + if ((!T->encloses(Usr->getParent()) && + !TaskFrameBlocks.contains(Usr->getParent()) && + (!TapirL || (Usr->getParent() != TapirL->getHeader() && + Usr->getParent() != TapirL->getLoopLatch())))) + continue; + U.set(STLoad); + } + } + + return std::make_pair(Closure, ArgsStart); +} + +/// Organize the set \p Inputs of values in \p F into a set \p Fixed of values +/// that can be used as inputs to a helper function. +void llvm::fixupInputSet(Function &F, const ValueSet &Inputs, ValueSet &Fixed) { + // Scan for any sret parameters in TaskInputs and add them first. These + // parameters must appear first or second in the prototype of the Helper + // function. + Value *SRetInput = nullptr; + if (F.hasStructRetAttr()) { + Function::arg_iterator ArgIter = F.arg_begin(); + if (F.hasParamAttribute(0, Attribute::StructRet)) + if (Inputs.count(&*ArgIter)) + SRetInput = &*ArgIter; + if (F.hasParamAttribute(1, Attribute::StructRet)) { + ++ArgIter; + if (Inputs.count(&*ArgIter)) + SRetInput = &*ArgIter; + } + } + if (SRetInput) { + LLVM_DEBUG(dbgs() << "sret input " << *SRetInput << "\n"); + Fixed.insert(SRetInput); + } + + // Sort the inputs to the task with largest arguments first, in order to + // improve packing or arguments in memory. + SmallVector InputsToSort; + for (Value *V : Inputs) + if (V != SRetInput) + InputsToSort.push_back(V); + LLVM_DEBUG({ + dbgs() << "After sorting:\n"; + for (Value *V : InputsToSort) + dbgs() << "\t" << *V << "\n"; + }); + const DataLayout &DL = F.getParent()->getDataLayout(); + std::sort(InputsToSort.begin(), InputsToSort.end(), + [&DL](const Value *A, const Value *B) { + return DL.getTypeSizeInBits(A->getType()) > + DL.getTypeSizeInBits(B->getType()); + }); + + // Add the remaining inputs. 
+ for (Value *V : InputsToSort) + if (!Fixed.count(V)) + Fixed.insert(V); +} + +/// Organize the inputs to task \p T, given in \p TaskInputs, to create an +/// appropriate set of inputs, \p HelperInputs, to pass to the outlined +/// function for \p T. +Instruction *llvm::fixupHelperInputs( + Function &F, Task *T, ValueSet &TaskInputs, ValueSet &HelperArgs, + Instruction *StorePt, Instruction *LoadPt, + TapirTarget::ArgStructMode useArgStruct, + ValueToValueMapTy &InputsMap, Loop *TapirL) { + if (TapirTarget::ArgStructMode::None != useArgStruct) { + std::pair ArgsStructInfo = + createTaskArgsStruct(TaskInputs, T, StorePt, LoadPt, + TapirTarget::ArgStructMode::Static == useArgStruct, + InputsMap, TapirL); + HelperArgs.insert(ArgsStructInfo.first); + return ArgsStructInfo.second; + } + + fixupInputSet(F, TaskInputs, HelperArgs); + return StorePt; +} + +/// Returns true if BasicBlock \p B is the immediate successor of only +/// detached-rethrow instructions. +bool llvm::isSuccessorOfDetachedRethrow(const BasicBlock *B) { + for (const BasicBlock *Pred : predecessors(B)) + if (!isDetachedRethrow(Pred->getTerminator())) + return false; + return true; +} + +/// Collect the set of blocks in task \p T. All blocks enclosed by \p T will be +/// pushed onto \p TaskBlocks. The set of blocks terminated by reattaches from +/// \p T are added to \p ReattachBlocks. The set of blocks terminated by +/// detached-rethrow instructions are added to \p TaskResumeBlocks. The set of +/// entry points to exception-handling blocks shared by \p T and other tasks in +/// the same function are added to \p SharedEHEntries. +void llvm::getTaskBlocks(Task *T, std::vector &TaskBlocks, + SmallPtrSetImpl &ReattachBlocks, + SmallPtrSetImpl &TaskResumeBlocks, + SmallPtrSetImpl &SharedEHEntries, + const DominatorTree *DT) { + NamedRegionTimer NRT("getTaskBlocks", "Get task blocks", TimerGroupName, + TimerGroupDescription, TimePassesIsEnabled); + SmallPtrSet SpindlesToExclude; + for (Spindle *TFSpindle : T->taskframe_creates()) + for (Spindle *S : TFSpindle->taskframe_spindles()) + SpindlesToExclude.insert(S); + + // Add taskframe-spindle blocks. + if (Spindle *TFCreateSpindle = T->getTaskFrameCreateSpindle()) { + for (Spindle *S : TFCreateSpindle->taskframe_spindles()) { + if (T->contains(S)) + continue; + + // Skip spindles that are placeholders. + if (isPlaceholderSuccessor(S->getEntry())) + continue; + + LLVM_DEBUG(dbgs() << "Adding blocks in taskframe spindle " << *S << "\n"); + assert(!SpindlesToExclude.count(S) && + "Taskframe spindle marked for exclusion."); + + if (T->getEHContinuationSpindle() == S) + SharedEHEntries.insert(S->getEntry()); + else { + // Some canonicalization methods, e.g., loop canonicalization, will + // introduce a basic block after a detached-rethrow that branches to the + // successor of the EHContinuation entry. + for (BasicBlock *Pred : predecessors(S->getEntry())) + if (isSuccessorOfDetachedRethrow(Pred)) + SharedEHEntries.insert(S->getEntry()); + } + + for (BasicBlock *B : S->blocks()) { + LLVM_DEBUG(dbgs() << "Adding task block " << B->getName() << "\n"); + TaskBlocks.push_back(B); + + if (isTaskFrameResume(B->getTerminator())) + TaskResumeBlocks.insert(B); + } + } + } + + // Record the predecessor spindles of the EH continuation, if there is one. + Spindle *EHContinuation = T->getEHContinuationSpindle(); + SmallPtrSet EHContPred; + if (EHContinuation) + for (Spindle *Pred : predecessors(EHContinuation)) + EHContPred.insert(Pred); + + // Add the spindles in the task proper. 
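+  // Spindles that belong to nested taskframes were gathered into
+  // SpindlesToExclude above and are skipped in this traversal.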
+ for (Spindle *S : depth_first>(T->getEntrySpindle())) { + if (SpindlesToExclude.count(S)) + continue; + + LLVM_DEBUG(dbgs() << "Adding task blocks in spindle " << *S << "\n"); + + // Record the entry blocks of any shared-EH spindles. + if (S->isSharedEH()) + SharedEHEntries.insert(S->getEntry()); + + // At -O0, the always-inliner can create blocks in the predecessor spindles + // of the EH continuation that are not reachable from the entry. These + // blocks are not cloned, but we mark them as shared EH entries so that + // outlining can correct any PHI nodes in those blocks. + if (EHContPred.count(S)) + for (BasicBlock *B : S->blocks()) + for (BasicBlock *Pred : predecessors(B)) + if (!DT->isReachableFromEntry(Pred)) { + SharedEHEntries.insert(B); + break; + } + + for (BasicBlock *B : S->blocks()) { + // Skip basic blocks that are successors of detached rethrows. They're + // dead anyway. + if (isSuccessorOfDetachedRethrow(B) || isPlaceholderSuccessor(B)) + continue; + + LLVM_DEBUG(dbgs() << "Adding task block " << B->getName() << "\n"); + TaskBlocks.push_back(B); + + // Record the blocks terminated by reattaches and detached rethrows. + if (isa(B->getTerminator())) + ReattachBlocks.insert(B); + if (isDetachedRethrow(B->getTerminator())) + TaskResumeBlocks.insert(B); + } + } +} + +/// Outlines the content of task \p T in function \p F into a new helper +/// function. The parameter \p Inputs specified the inputs to the helper +/// function. The map \p VMap is updated with the mapping of instructions in +/// \p T to instructions in the new helper function. +Function *llvm::createHelperForTask( + Function &F, Task *T, ValueSet &Args, Module *DestM, + ValueToValueMapTy &VMap, Type *ReturnType, OutlineAnalysis &OA) { + // Collect all basic blocks in this task. + std::vector TaskBlocks; + // Reattach instructions and detached rethrows in this task might need special + // handling. + SmallPtrSet ReattachBlocks; + SmallPtrSet TaskResumeBlocks; + // Entry blocks of shared-EH spindles may contain PHI nodes that need to be + // rewritten in the cloned helper. + SmallPtrSet SharedEHEntries; + getTaskBlocks(T, TaskBlocks, ReattachBlocks, TaskResumeBlocks, + SharedEHEntries, &OA.DT); + + SmallVector Returns; // Ignore returns cloned. + ValueSet Outputs; + DetachInst *DI = T->getDetach(); + + BasicBlock *Header = T->getEntry(); + BasicBlock *Entry = DI->getParent(); + if (Spindle *TaskFrameCreate = T->getTaskFrameCreateSpindle()) { + Header = TaskFrameCreate->getEntry(); + Entry = Header->getSinglePredecessor(); + } + + Twine NameSuffix = ".otd" + Twine(T->getTaskDepth()); + Function *Helper; + { + NamedRegionTimer NRT("CreateHelper", "Create helper function", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + std::unique_ptr Mat = + std::make_unique( + dyn_cast(DI->getSyncRegion())); + Helper = CreateHelper( + Args, Outputs, TaskBlocks, Header, Entry, DI->getContinue(), VMap, + DestM, F.getSubprogram() != nullptr, Returns, NameSuffix.str(), + &ReattachBlocks, &TaskResumeBlocks, &SharedEHEntries, nullptr, nullptr, + ReturnType, nullptr, nullptr, Mat.get()); + } + assert(Returns.empty() && "Returns cloned when cloning detached CFG."); + + // Add alignment assumptions to arguments of helper, based on alignment of + // values in old function. + AddAlignmentAssumptions(&F, Args, VMap, DI, &OA.AC, &OA.DT); + + // Move allocas in the newly cloned detached CFG to the entry block of the + // helper. 
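+  // Allocas are only treated as static stack slots when they appear in the
+  // entry block, so the clones of the task's allocas must be relocated there.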
+ { + NamedRegionTimer NRT("MoveAllocas", "Move allocas in cloned helper", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + // Collect the end instructions of the task. + SmallVector TaskEnds; + for (BasicBlock *EndBlock : ReattachBlocks) + TaskEnds.push_back(cast(VMap[EndBlock])->getTerminator()); + for (BasicBlock *EndBlock : TaskResumeBlocks) + TaskEnds.push_back(cast(VMap[EndBlock])->getTerminator()); + + // Move allocas in cloned detached block to entry of helper function. + BasicBlock *ClonedDetachedBlock = cast(VMap[T->getEntry()]); + MoveStaticAllocasInBlock(&Helper->getEntryBlock(), ClonedDetachedBlock, + TaskEnds); + + // If this task uses a taskframe, move allocas in cloned taskframe entry to + // entry of helper function. + if (Spindle *TFCreate = T->getTaskFrameCreateSpindle()) { + BasicBlock *ClonedTFEntry = cast(VMap[TFCreate->getEntry()]); + MoveStaticAllocasInBlock(&Helper->getEntryBlock(), ClonedTFEntry, + TaskEnds); + } + + // We do not need to add new llvm.stacksave/llvm.stackrestore intrinsics, + // because calling and returning from the helper will automatically manage + // the stack appropriately. + } + + // Convert the cloned detach into an unconditional branch. We do this + // conversion here in part to prevent the cloned task from being reprocessed. + if (VMap[DI]) { + NamedRegionTimer NRT("serializeClone", "Serialize cloned Tapir task", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + DetachInst *ClonedDI = cast(VMap[DI]); + BasicBlock *ClonedDetacher = ClonedDI->getParent(); + BasicBlock *ClonedContinue = ClonedDI->getContinue(); + ClonedContinue->removePredecessor(ClonedDetacher); + BranchInst *DetachRepl = BranchInst::Create(ClonedDI->getDetached()); + ReplaceInstWithInst(ClonedDI, DetachRepl); + VMap[DI] = DetachRepl; + } + + Helper->setMemoryEffects(computeFunctionBodyMemoryAccess(*Helper, OA.AA)); + + return Helper; +} + +/// Helper function to unlink task T's exception-handling blocks from T's +/// parent. +static void unlinkTaskEHFromParent(Task *T) { + DetachInst *DI = T->getDetach(); + + SmallPtrSet UnwindPHIs; + if (DI->hasUnwindDest()) + // Get PHIs in the unwind destination of the detach. + UnwindPHIs.insert(DI->getUnwindDest()); + // Get the PHI's that use the landing pad of the detach's unwind. + getDetachUnwindPHIUses(DI, UnwindPHIs); + + SmallVector ToRemove; + // Look through PHI's that use the landing pad of the detach's unwind, and + // update those PHI's to not refer to task T. + for (BasicBlock *BB : UnwindPHIs) { + for (BasicBlock *Pred : predecessors(BB)) { + // Ignore the shared-EH spindles in T, because those might be used by + // other subtasks of T's parent. The shared-EH spindles tracked by T's + // parent will be handled once all subtasks of T's parent have been + // processed. + if (T->simplyEncloses(Pred) && !T->encloses(BB) && + T->getParentTask()->encloses(BB)) { + // Update the PHI nodes in BB. + BB->removePredecessor(Pred); + // Remove the edge from Pred to BB. + IRBuilder<> B(Pred->getTerminator()); + Instruction *Unreach = B.CreateUnreachable(); + Unreach->setDebugLoc(Pred->getTerminator()->getDebugLoc()); + ToRemove.push_back(Pred->getTerminator()); + } + } + } + + // Remove the terminators we no longer need. 
+ for (Instruction *I : ToRemove) + I->eraseFromParent(); +} + +static BasicBlock *getTaskFrameContinue(Spindle *TF) { + Value *TFCreate = TF->getTaskFrameCreate(); + for (User *U : TFCreate->users()) { + if (IntrinsicInst *UI = dyn_cast(U)) { + if (Intrinsic::taskframe_end == UI->getIntrinsicID()) + return UI->getParent()->getSingleSuccessor(); + } + } + return nullptr; +} + +/// Outlines the content of taskframe \p TF in function \p F into a new helper +/// function. The parameter \p Inputs specified the inputs to the helper +/// function. The map \p VMap is updated with the mapping of instructions in \p +/// TF to instructions in the new helper function. +Function *llvm::createHelperForTaskFrame( + Function &F, Spindle *TF, ValueSet &Args, Module *DestM, + ValueToValueMapTy &VMap, Type *ReturnType, OutlineAnalysis &OA) { + // Collect all basic blocks in this task. + std::vector TaskBlocks; + // Reattach instructions and detached rethrows in this task might need special + // handling. + SmallPtrSet TFEndBlocks; + SmallPtrSet TFResumeBlocks; + // Entry blocks of shared-EH spindles may contain PHI nodes that need to be + // rewritten in the cloned helper. + SmallPtrSet SharedEHEntries; + { + NamedRegionTimer NRT("getTaskFrameBlocks", "Get taskframe blocks", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + // Get taskframe blocks + for (Spindle *S : TF->taskframe_spindles()) { + // Skip spindles that are placeholders. + if (isPlaceholderSuccessor(S->getEntry())) + continue; + + LLVM_DEBUG(dbgs() << "Adding blocks in taskframe spindle " << *S << "\n"); + + // Some canonicalization methods, e.g., loop canonicalization, will + // introduce a basic block after a detached-rethrow that branches to the + // successor of the EHContinuation entry. + for (BasicBlock *Pred : predecessors(S->getEntry())) { + assert(!endsTaskFrame(Pred, TF->getTaskFrameCreate()) && + "Taskframe spindle after taskframe.end"); + if (isDetachedRethrow(Pred->getTerminator())) + SharedEHEntries.insert(S->getEntry()); + if (isSuccessorOfDetachedRethrow(Pred)) + SharedEHEntries.insert(S->getEntry()); + } + + // Terminate landingpads might be shared between a taskframe and its parent. + // It's safe to clone these blocks, but we need to be careful about PHI + // nodes. + if (S != TF) { + for (Spindle *PredS : predecessors(S)) { + if (!TF->taskFrameContains(PredS)) { + LLVM_DEBUG( + dbgs() + << "Taskframe spindle has predecessor outside of taskframe: " + << *S << "\n"); + SharedEHEntries.insert(S->getEntry()); + break; + } + } + } + + for (BasicBlock *B : S->blocks()) { + LLVM_DEBUG(dbgs() << "Adding taskframe block " << B->getName() << "\n"); + TaskBlocks.push_back(B); + + // Record any blocks that end the taskframe. + if (endsTaskFrame(B)) { + LLVM_DEBUG(dbgs() << "Recording taskframe.end block " << B->getName() + << "\n"); + TFEndBlocks.insert(B); + } + if (isTaskFrameResume(B->getTerminator())) { + LLVM_DEBUG(dbgs() << "Recording taskframe.resume block " << B->getName() + << "\n"); + TFResumeBlocks.insert(B); + } + + // Terminate landingpads might be shared between a taskframe and its + // parent. It's safe to clone these blocks, but we need to be careful + // about PHI nodes. 
+ if ((B != S->getEntry()) && B->isLandingPad()) { + for (BasicBlock *Pred : predecessors(B)) { + if (!S->contains(Pred)) { + LLVM_DEBUG(dbgs() << "Block within taskframe spindle has " + "predecessor outside of spindle.\n"); + SharedEHEntries.insert(B); + } + } + } + } + } + } // end timed region + + SmallVector Returns; // Ignore returns cloned. + ValueSet Outputs; + Value *TFCreate = TF->getTaskFrameCreate(); + + BasicBlock *Header = TF->getEntry(); + BasicBlock *Entry = Header->getSinglePredecessor(); + BasicBlock *Continue = getTaskFrameContinue(TF); + assert(Continue && "Task frame lacks a continuation for outlining."); + + Twine NameSuffix = ".otf" + Twine(TF->getTaskFrameDepth()); + Function *Helper; + { + NamedRegionTimer NRT("CreateHelper", "Create helper function", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + std::unique_ptr Mat = + std::make_unique(); + Helper = CreateHelper(Args, Outputs, TaskBlocks, Header, Entry, Continue, + VMap, DestM, F.getSubprogram() != nullptr, Returns, + NameSuffix.str(), &TFEndBlocks, &TFResumeBlocks, + &SharedEHEntries, nullptr, nullptr, ReturnType, + nullptr, nullptr, Mat.get()); + } // end timed region + assert(Returns.empty() && "Returns cloned when cloning detached CFG."); + + // Add alignment assumptions to arguments of helper, based on alignment of + // values in old function. + AddAlignmentAssumptions(&F, Args, VMap, &Header->front(), &OA.AC, &OA.DT); + + // Move allocas in the newly cloned detached CFG to the entry block of the + // helper. + { + NamedRegionTimer NRT("MoveAllocas", "Move allocas in cloned helper", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + // Collect the end instructions of the task. + SmallVector TaskEnds; + for (BasicBlock *EndBlock : TFEndBlocks) + TaskEnds.push_back(cast(VMap[EndBlock])->getTerminator()); + for (BasicBlock *EndBlock : TFResumeBlocks) + TaskEnds.push_back(cast(VMap[EndBlock])->getTerminator()); + + // Move allocas in cloned taskframe entry block to entry of helper function. + BasicBlock *ClonedTFEntry = cast(VMap[Header]); + MoveStaticAllocasInBlock(&Helper->getEntryBlock(), ClonedTFEntry, + TaskEnds); + + // We do not need to add new llvm.stacksave/llvm.stackrestore intrinsics, + // because calling and returning from the helper will automatically manage + // the stack appropriately. + } // end timed region + + // Remove the cloned taskframe.end intrinsics, to prevent the cloned taskframe + // from being reprocessed. + if (VMap[TFCreate]) { + NamedRegionTimer NRT("serializeClone", "Serialize cloned Tapir task", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + SmallVector TFEndsToRemove; + for (BasicBlock *EndBlock : TFEndBlocks) { + BasicBlock *ClonedEndBlock = cast(VMap[EndBlock]); + if (Instruction *Prev = ClonedEndBlock->getTerminator()->getPrevNode()) + if (isTapirIntrinsic(Intrinsic::taskframe_end, Prev)) + TFEndsToRemove.push_back(Prev); + } + for (Instruction *ClonedTFEnd : TFEndsToRemove) + ClonedTFEnd->eraseFromParent(); + } + + Helper->setMemoryEffects(computeFunctionBodyMemoryAccess(*Helper, OA.AA)); + + return Helper; +} + +/// Outlines a taskframe \p TF into a helper function that accepts the inputs \p +/// Inputs. The map \p VMap is updated with the mapping of instructions in \p +/// TF to instructions in the new helper function. Information about the helper +/// function is returned as a TaskOutlineInfo structure. 
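+///
+/// If \p TF is the taskframe of a spawned task, the work is delegated to
+/// outlineTask() instead.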
+TaskOutlineInfo llvm::outlineTaskFrame( + Spindle *TF, ValueSet &Inputs, SmallVectorImpl &HelperInputs, + Module *DestM, ValueToValueMapTy &VMap, + TapirTarget::ArgStructMode useArgStruct, Type *ReturnType, + ValueToValueMapTy &InputMap, OutlineAnalysis &OA) { + if (Task *T = TF->getTaskFromTaskFrame()) + return outlineTask(T, Inputs, HelperInputs, DestM, VMap, useArgStruct, + ReturnType, InputMap, OA); + + Function &F = *TF->getEntry()->getParent(); + BasicBlock *Entry = TF->getEntry(); + + Instruction *StorePt = Entry->getSinglePredecessor()->getTerminator(); + // Find the corresponding taskframe.resume, if one exists. + BasicBlock *Unwind = getTaskFrameResumeDest(TF->getTaskFrameCreate()); + BasicBlock *Continue = getTaskFrameContinue(TF); + + // Convert the inputs of the task to inputs to the helper. + ValueSet HelperArgs; + // TODO: Consider supporting arg structs for arbitrary outlined taskframes. + fixupInputSet(F, Inputs, HelperArgs); + Instruction *ArgsStart = StorePt; + + for (Value *V : HelperArgs) + HelperInputs.push_back(V); + + // Clone the blocks into a helper function. + Function *Helper = createHelperForTaskFrame(F, TF, HelperArgs, DestM, VMap, + ReturnType, OA); + Instruction *ClonedTF = cast(VMap[TF->getTaskFrameCreate()]); + return TaskOutlineInfo(Helper, Entry, nullptr, ClonedTF, Inputs, + ArgsStart, StorePt, Continue, Unwind); +} + +/// Replaces the spawned task \p T, with associated TaskOutlineInfo \p Out, with +/// a call or invoke to the outlined helper function created for \p T. +Instruction *llvm::replaceTaskFrameWithCallToOutline( + Spindle *TF, TaskOutlineInfo &Out, SmallVectorImpl &OutlineInputs) { + if (Task *T = TF->getTaskFromTaskFrame()) + // Remove any dependencies from T's exception-handling code to T's parent. + unlinkTaskEHFromParent(T); + + Instruction *ToReplace = Out.ReplCall; + BasicBlock *TFResumeBB = nullptr; + if (Value *TFCreate = TF->getTaskFrameCreate()) + if (Instruction *TFResume = getTaskFrameResume(TFCreate)) + TFResumeBB = TFResume->getParent(); + + // Update PHI nodes in entry of taskframe. + TF->getEntry()->removePredecessor(ToReplace->getParent()); + + // Add call to new helper function in original function. + if (!Out.ReplUnwind) { + // Common case. Insert a call to the outline immediately before the detach. + CallInst *TopCall; + // Create call instruction. + IRBuilder<> Builder(Out.ReplCall); + TopCall = Builder.CreateCall(Out.Outline, OutlineInputs); + // Use a fast calling convention for the outline. + TopCall->setCallingConv(Out.Outline->getCallingConv()); + TopCall->setDebugLoc(ToReplace->getDebugLoc()); + if (Out.Outline->doesNotThrow()) + TopCall->setDoesNotThrow(); + // Replace the detach with an unconditional branch to its continuation. + ReplaceInstWithInst(ToReplace, BranchInst::Create(Out.ReplRet)); + return TopCall; + } else { + // The detach might catch an exception from the task. Replace the detach + // with an invoke of the outline. + InvokeInst *TopCall; + // Create invoke instruction. The ordinary return of the invoke is the + // detach's continuation, and the unwind return is the detach's unwind. + TopCall = InvokeInst::Create(Out.Outline, Out.ReplRet, Out.ReplUnwind, + OutlineInputs, "", ToReplace->getParent()); + if (TFResumeBB) { + // Update PHI nodes in the unwind destination of TFResumeBB. + for (PHINode &PN : Out.ReplUnwind->phis()) + PN.replaceIncomingBlockWith(TFResumeBB, ToReplace->getParent()); + // Replace the terminator of TFResumeBB with an unreachable. 
+ IRBuilder<> B(TFResumeBB->getTerminator()); + B.CreateUnreachable()->setDebugLoc( + TFResumeBB->getTerminator()->getDebugLoc()); + TFResumeBB->getTerminator()->eraseFromParent(); + } + // Use a fast calling convention for the outline. + TopCall->setCallingConv(Out.Outline->getCallingConv()); + TopCall->setDebugLoc(ToReplace->getDebugLoc()); + // Remove the detach. The invoke serves as a replacement terminator. + ToReplace->eraseFromParent(); + return TopCall; + } +} + +/// Outlines a task \p T into a helper function that accepts the inputs \p +/// Inputs. The map \p VMap is updated with the mapping of instructions in \p T +/// to instructions in the new helper function. Information about the helper +/// function is returned as a TaskOutlineInfo structure. +TaskOutlineInfo llvm::outlineTask( + Task *T, ValueSet &Inputs, SmallVectorImpl &HelperInputs, + Module *DestM, ValueToValueMapTy &VMap, + TapirTarget::ArgStructMode useArgStruct, Type *ReturnType, + ValueToValueMapTy &InputMap, OutlineAnalysis &OA) { + assert(!T->isRootTask() && "Cannot outline the root task."); + Function &F = *T->getEntry()->getParent(); + DetachInst *DI = T->getDetach(); + Value *TFCreate = T->getTaskFrameUsed(); + + Instruction *LoadPt = T->getEntry()->getFirstNonPHIOrDbgOrLifetime(); + Instruction *StorePt = DI; + BasicBlock *Unwind = DI->getUnwindDest(); + if (Spindle *TaskFrameCreate = T->getTaskFrameCreateSpindle()) { + // If this task uses a taskframe, determine load and store points based on + // taskframe intrinsics. + LoadPt = &*++TaskFrameCreate->getEntry()->begin(); + StorePt = + TaskFrameCreate->getEntry()->getSinglePredecessor()->getTerminator(); + // Ensure debug information on StorePt + if (!StorePt->getDebugLoc()) + StorePt->setDebugLoc(DI->getDebugLoc()); + if (Unwind) + // Find the corresponding taskframe.resume. + Unwind = getTaskFrameResumeDest(T->getTaskFrameUsed()); + } + + // Convert the inputs of the task to inputs to the helper. + ValueSet HelperArgs; + Instruction *ArgsStart = fixupHelperInputs(F, T, Inputs, HelperArgs, StorePt, + LoadPt, useArgStruct, InputMap); + for (Value *V : HelperArgs) + HelperInputs.push_back(V); + + // Clone the blocks into a helper function. + Function *Helper = createHelperForTask(F, T, HelperArgs, DestM, VMap, + ReturnType, OA); + Value *ClonedTFCreate = TFCreate ? VMap[TFCreate] : nullptr; + return TaskOutlineInfo(Helper, T->getEntry(), + dyn_cast_or_null(VMap[DI]), + dyn_cast_or_null(ClonedTFCreate), Inputs, + ArgsStart, StorePt, DI->getContinue(), Unwind); +} + +//----------------------------------------------------------------------------// +// Methods for lowering Tapir loops + +/// Returns true if the value \p V is defined outside the set \p Blocks of basic +/// blocks in a function. +static bool definedOutsideBlocks(const Value *V, + SmallPtrSetImpl &Blocks) { + if (isa(V)) return true; + if (const Instruction *I = dyn_cast(V)) + return !Blocks.count(I->getParent()); + return false; +} + +/// Returns true if the value V used inside the body of Tapir loop L is defined +/// outside of L. +static bool taskInputDefinedOutsideLoop(const Value *V, const Loop *L) { + if (isa(V)) + return true; + + const BasicBlock *Header = L->getHeader(); + const BasicBlock *Latch = L->getLoopLatch(); + if (const Instruction *I = dyn_cast(V)) + if ((Header != I->getParent()) && (Latch != I->getParent())) + return true; + return false; +} + +/// Given a Tapir loop \p TL and the set of inputs to the task inside that loop, +/// returns the set of inputs for the Tapir loop itself. 
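+///
+/// The result contains the task inputs that are defined outside the loop,
+/// plus any values used by the loop-control code in the loop header and latch
+/// that are defined outside of those two blocks.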
+ValueSet llvm::getTapirLoopInputs(TapirLoopInfo *TL, ValueSet &TaskInputs) { + Loop *L = TL->getLoop(); + Task *T = TL->getTask(); + ValueSet LoopInputs; + + for (Value *V : TaskInputs) + if (taskInputDefinedOutsideLoop(V, L)) + LoopInputs.insert(V); + + const Value *SyncRegion = T->getDetach()->getSyncRegion(); + + SmallPtrSet BlocksToCheck; + BlocksToCheck.insert(L->getHeader()); + BlocksToCheck.insert(L->getLoopLatch()); + for (BasicBlock *BB : BlocksToCheck) { + for (Instruction &II : *BB) { + // Skip the condition of this loop, since we will process that specially. + if (TL->getCondition() == &II) continue; + // Examine all operands of this instruction. + for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; + ++OI) { + // If the operand is the sync region of this task's detach, skip it. + if (SyncRegion == *OI) + continue; + LLVM_DEBUG({ + if (Instruction *OP = dyn_cast(*OI)) + assert(!T->encloses(OP->getParent()) && + "Loop control uses value defined in body task."); + }); + // If this operand is not defined in the header or latch, it's an input. + if (definedOutsideBlocks(*OI, BlocksToCheck)) + LoopInputs.insert(*OI); + } + } + } + + return LoopInputs; +} + +/// Replaces the Tapir loop \p TL, with associated TaskOutlineInfo \p Out, with +/// a call or invoke to the outlined helper function created for \p TL. +Instruction *llvm::replaceLoopWithCallToOutline( + TapirLoopInfo *TL, TaskOutlineInfo &Out, + SmallVectorImpl &OutlineInputs) { + // Remove any dependencies from the detach unwind of T code to T's parent. + unlinkTaskEHFromParent(TL->getTask()); + + LLVM_DEBUG({ + dbgs() << "Creating call with arguments:\n"; + for (Value *V : OutlineInputs) + dbgs() << "\t" << *V << "\n"; + }); + + Loop *L = TL->getLoop(); + // Add call to new helper function in original function. + if (!Out.ReplUnwind) { + // Common case. Insert a call to the outline immediately before the detach. + CallInst *TopCall; + // Create call instruction. + IRBuilder<> Builder(Out.ReplCall); + TopCall = Builder.CreateCall(Out.Outline, OutlineInputs); + // Use a fast calling convention for the outline. + TopCall->setCallingConv(Out.Outline->getCallingConv()); + TopCall->setDebugLoc(TL->getDebugLoc()); + if (Out.Outline->doesNotThrow()) + TopCall->setDoesNotThrow(); + // Replace the loop with an unconditional branch to its exit. + L->getHeader()->removePredecessor(Out.ReplCall->getParent()); + ReplaceInstWithInst(Out.ReplCall, BranchInst::Create(Out.ReplRet)); + return TopCall; + } else { + // The detach might catch an exception from the task. Replace the detach + // with an invoke of the outline. + InvokeInst *TopCall; + + // Create invoke instruction. The ordinary return of the invoke is the + // detach's continuation, and the unwind return is the detach's unwind. + TopCall = InvokeInst::Create(Out.Outline, Out.ReplRet, Out.ReplUnwind, + OutlineInputs); + // Use a fast calling convention for the outline. + TopCall->setCallingConv(Out.Outline->getCallingConv()); + TopCall->setDebugLoc(TL->getDebugLoc()); + // Replace the loop with the invoke. + L->getHeader()->removePredecessor(Out.ReplCall->getParent()); + ReplaceInstWithInst(Out.ReplCall, TopCall); + // Add invoke parent as a predecessor for all Phi nodes in ReplUnwind. 
+ for (PHINode &Phi : Out.ReplUnwind->phis()) + Phi.addIncoming(Phi.getIncomingValueForBlock(L->getHeader()), + TopCall->getParent()); + return TopCall; + } +} + +bool TapirTarget::shouldProcessFunction(const Function &F) const { + if (F.getName() == "main") + return true; + + if (canDetach(&F)) + return true; + + for (const Instruction &I : instructions(&F)) + if (const IntrinsicInst *II = dyn_cast(&I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::tapir_loop_grainsize: + case Intrinsic::task_frameaddress: + case Intrinsic::tapir_runtime_start: + case Intrinsic::tapir_runtime_end: + return true; + default: + break; + } + } + + return false; +} + +void TapirTarget::lowerTaskFrameAddrCall(CallInst *TaskFrameAddrCall) { + // By default, replace calls to task_frameaddress with ordinary calls to the + // frameaddress intrinsic. + TaskFrameAddrCall->setCalledFunction(Intrinsic::getDeclaration( + &M, Intrinsic::frameaddress, PointerType::getInt8PtrTy(M.getContext()))); +} + +void TapirTarget::lowerTapirRTCalls(SmallVectorImpl &TapirRTCalls, + Function &F, BasicBlock *TFEntry) { + // By default, do nothing with tapir_runtime_{start,end} calls. + return; +} + +/// Process the Tapir instructions in an ordinary (non-spawning and not spawned) +/// function \p F directly. +bool TapirTarget::processOrdinaryFunction(Function &F, BasicBlock *TFEntry) { + // By default, do no special processing for ordinary functions. Instead, the + // function will be processed using TapirToTargetImpl::processSimpleABI(). + return false; +} diff --git a/llvm/lib/Transforms/Tapir/OMPTaskABI.cpp b/llvm/lib/Transforms/Tapir/OMPTaskABI.cpp new file mode 100644 index 000000000000000..c0fdab7440b8443 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/OMPTaskABI.cpp @@ -0,0 +1,594 @@ +//===- OMPTaskABI.cpp - Generic interface to various runtime systems--------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the OMP Task ABI to convert Tapir instructions to calls +// into kmpc task runtime calls. 
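+//
+// The __rts_* entry points this lowering relies on (__rts_enter_frame,
+// __rts_spawn, __rts_sync, and so on) are declared here, and their
+// definitions are linked in from a runtime bitcode file; see
+// OMPTaskABI::prepareModule().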
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/OMPTaskABI.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ModRef.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "omptaskabi" + +extern cl::opt DebugABICalls; + +static cl::opt ClRuntimeBCPath( + "omp-bc-path", cl::init(""), + cl::desc("Path to the bitcode file for the runtime ABI"), + cl::Hidden); + +static const StringRef StackFrameName = "__rts_sf"; + +namespace { + +// Custom DiagnosticInfo for linking the Lambda ABI bitcode file. +class OMPTaskABILinkDiagnosticInfo : public DiagnosticInfo { + const Module *SrcM; + const Twine &Msg; + +public: + OMPTaskABILinkDiagnosticInfo(DiagnosticSeverity Severity, const Module *SrcM, + const Twine &Msg) + : DiagnosticInfo(DK_Lowering, Severity), SrcM(SrcM), Msg(Msg) {} + void print(DiagnosticPrinter &DP) const override { + DP << "linking module '" << SrcM->getModuleIdentifier() << "': " << Msg; + } +}; + +// Custom DiagnosticHandler to handle diagnostics arising when linking the +// Lambda ABI bitcode file. +class OMPTaskABIDiagnosticHandler final : public DiagnosticHandler { + const Module *SrcM; + DiagnosticHandler *OrigHandler; + +public: + OMPTaskABIDiagnosticHandler(const Module *SrcM, + DiagnosticHandler *OrigHandler) + : SrcM(SrcM), OrigHandler(OrigHandler) {} + + bool handleDiagnostics(const DiagnosticInfo &DI) override { + if (DI.getKind() != DK_Linker) + return OrigHandler->handleDiagnostics(DI); + + std::string MsgStorage; + { + raw_string_ostream Stream(MsgStorage); + DiagnosticPrinterRawOStream DP(Stream); + DI.print(DP); + } + return OrigHandler->handleDiagnostics( + OMPTaskABILinkDiagnosticInfo(DI.getSeverity(), SrcM, MsgStorage)); + } +}; + +// Structure recording information about runtime ABI functions. +struct RTSFnDesc { + StringRef FnName; + FunctionType *FnType; + FunctionCallee &FnCallee; +}; +} // namespace + +// void OMPTaskABI::setOptions(const TapirTargetOptions &Options) { +// if (!isa(Options)) +// return; + +// const OMPTaskABIOptions &OptionsCast = cast(Options); + +// // Get the path to the runtime bitcode file. +// RuntimeBCPath = OptionsCast.getRuntimeBCPath(); +// } + +void OMPTaskABI::prepareModule() { + LLVMContext &C = M.getContext(); + const DataLayout &DL = DestM.getDataLayout(); + Type *Int8Ty = Type::getInt8Ty(C); + Type *Int16Ty = Type::getInt16Ty(C); + Type *Int32Ty = Type::getInt32Ty(C); + Type *Int64Ty = Type::getInt64Ty(C); + + // If a runtime bitcode path is given via the command line, use it. 
+ if ("" != ClRuntimeBCPath) + RuntimeBCPath = ClRuntimeBCPath; + + if ("" == RuntimeBCPath) { + C.emitError("OMPTaskABI: No bitcode ABI file given."); + return; + } + + LLVM_DEBUG(dbgs() << "Using external bitcode file for Lambda ABI: " + << RuntimeBCPath << "\n"); + SMDiagnostic SMD; + + // Parse the bitcode file. This call imports structure definitions, but not + // function definitions. + if (std::unique_ptr ExternalModule = + parseIRFile(RuntimeBCPath, SMD, C)) { + // Get the original DiagnosticHandler for this context. + std::unique_ptr OrigDiagHandler = + C.getDiagnosticHandler(); + + // Setup an OMPTaskABIDiagnosticHandler for this context, to handle + // diagnostics that arise from linking ExternalModule. + C.setDiagnosticHandler(std::make_unique( + ExternalModule.get(), OrigDiagHandler.get())); + + // Link the external module into the current module, copying over global + // values. + // + // TODO: Consider restructuring the import process to use + // Linker::Flags::LinkOnlyNeeded to copy over only the necessary contents + // from the external module. + bool Fail = Linker::linkModules( + M, std::move(ExternalModule), Linker::Flags::None, + [](Module &M, const StringSet<> &GVS) { + for (StringRef GVName : GVS.keys()) { + LLVM_DEBUG(dbgs() << "Linking global value " << GVName << "\n"); + if (Function *Fn = M.getFunction(GVName)) { + if (!Fn->isDeclaration() && !Fn->hasComdat()) + // We set the function's linkage as available_externally, so + // that subsequent optimizations can remove these definitions + // from the module. We don't want this module redefining any of + // these symbols, even if they aren't inlined, because the + // Lambda runtime library will provide those definitions later. + Fn->setLinkage(Function::AvailableExternallyLinkage); + } else if (GlobalVariable *G = M.getGlobalVariable(GVName)) { + if (!G->isDeclaration() && !G->hasComdat()) + G->setLinkage(GlobalValue::AvailableExternallyLinkage); + } + } + }); + if (Fail) + C.emitError("OMPTaskABI: Failed to link bitcode ABI file: " + + Twine(RuntimeBCPath)); + + // Restore the original DiagnosticHandler for this context. + C.setDiagnosticHandler(std::move(OrigDiagHandler)); + } else { + C.emitError("OMPTaskABI: Failed to parse bitcode ABI file: " + + Twine(RuntimeBCPath)); + } + + // Get or create local definitions of RTS structure types. + const char *StackFrameName = "struct.__rts_stack_frame"; + StackFrameTy = StructType::lookupOrCreate(C, StackFrameName); + + const char *TaskTyName = "struct.kmp_task"; + TaskTy = StructType::lookupOrCreate(C, TaskTyName); + + PointerType *StackFramePtrTy = PointerType::getUnqual(StackFrameTy); + Type *VoidTy = Type::getVoidTy(C); + Type *VoidPtrTy = Type::getInt8PtrTy(C); + + // Define the types of the RTS functions. 
+ FunctionType *RTSFnTy = FunctionType::get(VoidTy, {StackFramePtrTy}, false); + SpawnBodyFnArgTy = VoidPtrTy; + Type *IntPtrTy = DL.getIntPtrType(C); + SpawnBodyFnArgSizeTy = IntPtrTy; + SpawnBodyFnTy = FunctionType::get(Int32Ty, {Int32Ty, VoidPtrTy}, false); + FunctionType *ArgsFromTaskFnTy = + FunctionType::get(VoidPtrTy, {VoidPtrTy, IntPtrTy}, false); + FunctionType *SpawnFnTy = + FunctionType::get(VoidTy, + {StackFramePtrTy, PointerType::getUnqual(SpawnBodyFnTy), + SpawnBodyFnArgTy, SpawnBodyFnArgSizeTy, IntPtrTy}, + false); + FunctionType *Grainsize8FnTy = FunctionType::get(Int8Ty, {Int8Ty}, false); + FunctionType *Grainsize16FnTy = FunctionType::get(Int16Ty, {Int16Ty}, false); + FunctionType *Grainsize32FnTy = FunctionType::get(Int32Ty, {Int32Ty}, false); + FunctionType *Grainsize64FnTy = FunctionType::get(Int64Ty, {Int64Ty}, false); + FunctionType *WorkerInfoTy = FunctionType::get(Int32Ty, {}, false); + + // Create an array of RTS functions, with their associated types and + // FunctionCallee member variables in the OMPTaskABI class. + RTSFnDesc RTSFunctions[] = { + {"__rts_enter_frame", RTSFnTy, RTSEnterFrame}, + {"__rts_get_args_from_task", ArgsFromTaskFnTy, RTSGetArgsFromTask}, + {"__rts_spawn", SpawnFnTy, RTSSpawn}, + {"__rts_sync", RTSFnTy, RTSSync}, + {"__rts_sync_nothrow", RTSFnTy, RTSSyncNoThrow}, + {"__rts_loop_grainsize_8", Grainsize8FnTy, RTSLoopGrainsize8}, + {"__rts_loop_grainsize_16", Grainsize16FnTy, RTSLoopGrainsize16}, + {"__rts_loop_grainsize_32", Grainsize32FnTy, RTSLoopGrainsize32}, + {"__rts_loop_grainsize_64", Grainsize64FnTy, RTSLoopGrainsize64}, + {"__rts_get_num_workers", WorkerInfoTy, RTSGetNumWorkers}, + {"__rts_get_worker_id", WorkerInfoTy, RTSGetWorkerID}, + }; + + // Add attributes to internalized functions. + for (RTSFnDesc FnDesc : RTSFunctions) { + assert(!FnDesc.FnCallee && "Redefining RTS function"); + FnDesc.FnCallee = M.getOrInsertFunction(FnDesc.FnName, FnDesc.FnType); + assert(isa(FnDesc.FnCallee.getCallee()) && + "Runtime function is not a function"); + Function *Fn = cast(FnDesc.FnCallee.getCallee()); + + Fn->setDoesNotThrow(); + + // Unless we're debugging, mark the function as always_inline. This + // attribute is required for some functions, but is helpful for all + // functions. + if (!DebugABICalls) + Fn->addFnAttr(Attribute::AlwaysInline); + else + Fn->removeFnAttr(Attribute::AlwaysInline); + + if (Fn->getName() == "__rts_get_num_workers" || + Fn->getName() == "__rts_get_worker_id") { + Fn->setLinkage(Function::InternalLinkage); + } + } + + // If no valid bitcode file was found fill in the missing pieces. + // An error should have been emitted already unless the user + // set DebugABICalls. + + if (StackFrameTy->isOpaque()) { + // Create a dummy __rts_stack_frame structure + StackFrameTy->setBody(Int64Ty); + } + // Create declarations of all RTS functions, and add basic attributes to those + // declarations. + for (RTSFnDesc FnDesc : RTSFunctions) { + if (FnDesc.FnCallee) + continue; + FnDesc.FnCallee = M.getOrInsertFunction(FnDesc.FnName, FnDesc.FnType); + assert(isa(FnDesc.FnCallee.getCallee()) && + "RTS function is not a function"); + Function *Fn = cast(FnDesc.FnCallee.getCallee()); + + Fn->setDoesNotThrow(); + } +} + +void OMPTaskABI::addHelperAttributes(Function &Helper) { + // We'll be creating a new helper function, and we want to inline this helper + // function into that one to reduce overheads. 
+ Helper.addFnAttr(Attribute::AlwaysInline); + // If the helper uses an argument structure, then it is not a write-only + // function. + if (getArgStructMode() != ArgStructMode::None) { + Helper.removeFnAttr(Attribute::WriteOnly); + Helper.setMemoryEffects( + MemoryEffects(MemoryEffects::Location::Other, ModRefInfo::ModRef)); + } + // Note that the address of the helper is unimportant. + Helper.setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + // The helper is internal to this module. We use internal linkage, rather + // than private linkage, so that tools can still reference the helper + // function. + Helper.setLinkage(GlobalValue::InternalLinkage); +} + +// Check whether the allocation of a __rts_stack_frame can be inserted after +// instruction \p I. +static bool skipInstruction(const Instruction &I) { + if (isa(I)) + return true; + + if (isa(I)) + return true; + + if (const IntrinsicInst *II = dyn_cast(&I)) { + // Skip simple intrinsics + switch(II->getIntrinsicID()) { + case Intrinsic::annotation: + case Intrinsic::assume: + case Intrinsic::sideeffect: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + case Intrinsic::is_constant: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + case Intrinsic::experimental_gc_result: + case Intrinsic::experimental_gc_relocate: + case Intrinsic::experimental_noalias_scope_decl: + case Intrinsic::syncregion_start: + case Intrinsic::taskframe_create: + return true; + default: + return false; + } + } + + return false; +} + +// Scan the basic block \p B to find a point to insert the allocation of a +// __rts_stack_frame. +static Instruction *getStackFrameInsertPt(BasicBlock &B) { + BasicBlock::iterator BI(B.getFirstInsertionPt()); + BasicBlock::const_iterator BE(B.end()); + + // Scan the basic block for the first instruction we should not skip. + while (BI != BE) { + if (!skipInstruction(*BI)) { + return &*BI; + } + ++BI; + } + + // We reached the end of the basic block; return the terminator. + return B.getTerminator(); +} + +/// Create the __rts_stack_frame for the spawning function. +Value *OMPTaskABI::CreateStackFrame(Function &F) { + const DataLayout &DL = F.getParent()->getDataLayout(); + Type *SFTy = StackFrameTy; + + IRBuilder<> B(getStackFrameInsertPt(F.getEntryBlock())); + AllocaInst *SF = B.CreateAlloca(SFTy, DL.getAllocaAddrSpace(), + /*ArraySize*/ nullptr, + /*Name*/ StackFrameName); + + SF->setAlignment(StackFrameAlign); + + return SF; +} + +Value* OMPTaskABI::GetOrCreateStackFrame(Function &F) { + if (DetachCtxToStackFrame.count(&F)) + return DetachCtxToStackFrame[&F]; + + Value *SF = CreateStackFrame(F); + DetachCtxToStackFrame[&F] = SF; + + return SF; +} + +// Insert a call in Function F to __rts_enter_frame to initialize the +// __rts_stack_frame in F. If TaskFrameCreate is nonnull, the call to +// __rts_enter_frame is inserted at TaskFramecreate. +CallInst *OMPTaskABI::InsertStackFramePush(Function &F, + Instruction *TaskFrameCreate, + bool Helper) { + Instruction *SF = cast(GetOrCreateStackFrame(F)); + + BasicBlock::iterator InsertPt = ++SF->getIterator(); + IRBuilder<> B(&(F.getEntryBlock()), InsertPt); + if (TaskFrameCreate) + B.SetInsertPoint(TaskFrameCreate); + if (!B.getCurrentDebugLocation()) { + // Try to find debug information later in this block for the ABI call. 
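+    // (Likely rationale: these ABI calls are marked always_inline, and
+    // inlinable calls without a debug location can trip debug-info
+    // verification when the caller has debug info.)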
+ BasicBlock::iterator BI = B.GetInsertPoint(); + BasicBlock::const_iterator BE(B.GetInsertBlock()->end()); + while (BI != BE) { + if (DebugLoc Loc = BI->getDebugLoc()) { + B.SetCurrentDebugLocation(Loc); + break; + } + ++BI; + } + } + + Value *Args[1] = {SF}; + return B.CreateCall(RTSEnterFrame, Args); +} + +// Insert a call in Function F to pop the stack frame. +// +// PromoteCallsToInvokes dictates whether call instructions that can throw are +// promoted to invoke instructions prior to inserting the epilogue-function +// calls. +void OMPTaskABI::InsertStackFramePop(Function &F, bool PromoteCallsToInvokes, + bool InsertPauseFrame, bool Helper) {} + +/// Lower a call to get the grainsize of a Tapir loop. +Value *OMPTaskABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { + Value *Limit = GrainsizeCall->getArgOperand(0); + IRBuilder<> Builder(GrainsizeCall); + + // Select the appropriate __rts_grainsize function, based on the type. + FunctionCallee RTSGrainsizeCall; + if (GrainsizeCall->getType()->isIntegerTy(8)) + RTSGrainsizeCall = RTSLoopGrainsize8; + else if (GrainsizeCall->getType()->isIntegerTy(16)) + RTSGrainsizeCall = RTSLoopGrainsize16; + else if (GrainsizeCall->getType()->isIntegerTy(32)) + RTSGrainsizeCall = RTSLoopGrainsize32; + else if (GrainsizeCall->getType()->isIntegerTy(64)) + RTSGrainsizeCall = RTSLoopGrainsize64; + else + llvm_unreachable("No RTSGrainsize call matches type for Tapir loop."); + + Value *Grainsize = Builder.CreateCall(RTSGrainsizeCall, Limit); + + // Replace uses of grainsize intrinsic call with this grainsize value. + GrainsizeCall->replaceAllUsesWith(Grainsize); + return Grainsize; +} + +// Lower a sync instruction SI. +void OMPTaskABI::lowerSync(SyncInst &SI) { + Function &Fn = *SI.getFunction(); + if (!DetachCtxToStackFrame[&Fn]) + // If we have not created a stackframe for this function, then we don't need + // to handle the sync. + return; + + Value *SF = GetOrCreateStackFrame(Fn); + Value *Args[] = { SF }; + assert(Args[0] && "sync used in function without frame!"); + + Instruction *SyncUnwind = nullptr; + BasicBlock *SyncCont = SI.getSuccessor(0); + BasicBlock *SyncUnwindDest = nullptr; + // Determine whether a sync.unwind immediately follows SI. 
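+  // The shape being matched is, schematically (illustrative Tapir IR):
+  //
+  //       sync within %syncreg, label %sync.cont
+  //     sync.cont:
+  //       invoke void @llvm.sync.unwind(token %syncreg)
+  //               to label %cont unwind label %lpad
+  //
+  // If such an invoke is present, its unwind destination becomes the unwind
+  // target of the lowered __rts_sync call below.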
+ if (InvokeInst *II = + dyn_cast(SyncCont->getFirstNonPHIOrDbgOrLifetime())) { + if (isSyncUnwind(II)) { + SyncUnwind = II; + SyncCont = II->getNormalDest(); + SyncUnwindDest = II->getUnwindDest(); + } + } + + CallBase *CB; + if (!SyncUnwindDest) { + if (Fn.doesNotThrow()) + CB = CallInst::Create(RTSSyncNoThrow, Args, "", + /*insert before*/ &SI); + else + CB = CallInst::Create(RTSSync, Args, "", /*insert before*/ &SI); + + BranchInst::Create(SyncCont, CB->getParent()); + } else { + CB = InvokeInst::Create(RTSSync, SyncCont, SyncUnwindDest, Args, "", + /*insert before*/ &SI); + for (PHINode &PN : SyncCont->phis()) + PN.addIncoming(PN.getIncomingValueForBlock(SyncUnwind->getParent()), + SI.getParent()); + for (PHINode &PN : SyncUnwindDest->phis()) + PN.addIncoming(PN.getIncomingValueForBlock(SyncUnwind->getParent()), + SI.getParent()); + } + CB->setDebugLoc(SI.getDebugLoc()); + SI.eraseFromParent(); +} + +bool OMPTaskABI::preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) { + return false; +} +void OMPTaskABI::postProcessFunction(Function &F, bool ProcessingTapirLoops) {} +void OMPTaskABI::postProcessHelper(Function &F) {} + +void OMPTaskABI::preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, + bool IsSpawner, BasicBlock *TFEntry) { + if (IsSpawner) + InsertStackFramePush(F, TaskFrameCreate, /*Helper*/ true); +} + +void OMPTaskABI::postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, + bool IsSpawner, BasicBlock *TFEntry) { + if (IsSpawner) + InsertStackFramePop(F, /*PromoteCallsToInvokes*/ true, + /*InsertPauseFrame*/ true, /*Helper*/ true); +} + +void OMPTaskABI::preProcessRootSpawner(Function &F, BasicBlock *TFEntry) { + InsertStackFramePush(F); +} + +void OMPTaskABI::postProcessRootSpawner(Function &F, BasicBlock *TFEntry) { + InsertStackFramePop(F, /*PromoteCallsToInvokes*/ false, + /*InsertPauseFrame*/ false, /*Helper*/ false); +} + +void OMPTaskABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) { + const DataLayout &DL = DestM.getDataLayout(); + CallBase *ReplCall = cast(TOI.ReplCall); + Function *Helper = TOI.Outline; + + Function &F = *ReplCall->getFunction(); + Value *SF = DetachCtxToStackFrame[&F]; + assert(SF && "No frame found for spawning task"); + + // Create OMP function helper to match required interface. + LLVMContext &C = M.getContext(); + Function *OMPTask = + Function::Create(SpawnBodyFnTy, GlobalValue::InternalLinkage, + "omp_task." + Helper->getName(), &M); + + { + Function *NewFunc = OMPTask; + Function *OldFunc = Helper; + + // Copy all attributes other than those stored in the AttributeSet. We need + // to remap the parameter indices of the AttributeSet. + AttributeList NewAttrs = NewFunc->getAttributes(); + NewFunc->copyAttributesFrom(OldFunc); + NewFunc->setAttributes(NewAttrs); + + SmallVector NewArgAttrs(NewFunc->arg_size()); + AttributeList OldAttrs = OldFunc->getAttributes(); + + NewFunc->setAttributes( + AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttrs(), + OldAttrs.getRetAttrs(), NewArgAttrs)); + } + + // Get the alignment of the helper arguments. The bitcode-ABI functions may + // use the alignment to align the shared variables in the storage allocated by + // the OpenMP runtime, especially to accommodate vector arguments. + AllocaInst *ArgAlloca = cast(ReplCall->getArgOperand(0)); + uint64_t Alignment = + DL.getPrefTypeAlign(ArgAlloca->getAllocatedType()).value(); + + { + // Populate the OMP function helper. 
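+    // The generated helper has the SpawnBodyFnTy signature, i.e. roughly
+    //   int omp_task.<helper>(int id, void *task)
+    // (the first argument is presumably the OpenMP thread id and is unused
+    // here). It recovers the outlined helper's argument struct from the task
+    // object and forwards it, returning 0.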
+    BasicBlock *OMPTaskBB = BasicBlock::Create(C, "entry", OMPTask);
+    IRBuilder<> IRB(ReturnInst::Create(
+        C, Constant::getNullValue(Type::getInt32Ty(C)), OMPTaskBB));
+    // Get the helper arguments from the task structure.
+    Value *ArgsFromTask = IRB.CreateCall(
+        RTSGetArgsFromTask, {OMPTask->getArg(1), IRB.getInt64(Alignment)});
+    Value *ArgsCast = IRB.CreateBitOrPointerCast(
+        ArgsFromTask, ArgAlloca->getType());
+    // Insert call to helper in OMP function helper.
+    CallInst *Call = IRB.CreateCall(ReplCall->getCalledFunction(), {ArgsCast});
+    Call->setCallingConv(ReplCall->getCallingConv());
+  }
+
+  // Replace the original call to the helper with a call to __rts_spawn.
+  IRBuilder<> B(ReplCall);
+  Value *ArgCast = B.CreateBitOrPointerCast(ArgAlloca, SpawnBodyFnArgTy);
+  auto ArgSize = ArgAlloca->getAllocationSizeInBits(DL);
+  assert(ArgSize &&
+         "Could not determine size of compiler-generated ArgStruct.");
+  Value *ArgSizeVal = ConstantInt::get(SpawnBodyFnArgSizeTy, *ArgSize / 8);
+
+  if (InvokeInst *II = dyn_cast<InvokeInst>(ReplCall)) {
+    B.CreateInvoke(RTSSpawn, II->getNormalDest(), II->getUnwindDest(),
+                   {SF, OMPTask, ArgCast, ArgSizeVal, B.getInt64(Alignment)});
+  } else {
+    B.CreateCall(RTSSpawn,
+                 {SF, OMPTask, ArgCast, ArgSizeVal, B.getInt64(Alignment)});
+  }
+
+  ReplCall->eraseFromParent();
+}
diff --git a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp
new file mode 100644
index 000000000000000..8794b9a17ac3a49
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp
@@ -0,0 +1,1128 @@
+//===- OpenCilkABI.cpp - Interface to the OpenCilk runtime system ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenCilk ABI to convert Tapir instructions to calls
+// into the OpenCilk runtime system.
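+//
+// In outline: spawning functions allocate a __cilkrts_stack_frame and call
+// __cilkrts_enter_frame{,_helper}; detached task bodies call __cilkrts_detach;
+// sync instructions are lowered to calls or invokes of __cilk_sync or
+// __cilk_sync_nothrow; and function exits call the
+// __cilk_{parent,helper}_epilogue routines. Most of these entry points are
+// imported from a bitcode ABI file so that they can be inlined into user code.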
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/OpenCilkABI.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ModRef.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "opencilk" + +extern cl::opt DebugABICalls; + +static cl::opt UseOpenCilkRuntimeBC( + "use-opencilk-runtime-bc", cl::init(true), + cl::desc("Use a bitcode file for the OpenCilk runtime ABI"), cl::Hidden); +static cl::opt ClOpenCilkRuntimeBCPath( + "opencilk-runtime-bc-path", cl::init(""), + cl::desc("Path to the bitcode file for the OpenCilk runtime ABI"), + cl::Hidden); + +#define CILKRTS_FUNC(name) Get__cilkrts_##name() + +static const StringRef StackFrameName = "__cilkrts_sf"; + +OpenCilkABI::OpenCilkABI(Module &M) : TapirTarget(M) {} + +// Helper function to fix the implementation of __cilk_sync. In particular, +// this fixup ensures that __cilk_sync, and specific __cilkrts method calls +// therein, appear that they may throw an exception. Since the bitcode-ABI file +// is built from C code, it won't necessarily be marked appropriately for +// exception handling. +static void fixCilkSyncFn(Module &M, Function *Fn) { + Fn->removeFnAttr(Attribute::NoUnwind); + Function *ExceptionRaiseFn = M.getFunction("__cilkrts_check_exception_raise"); + Function *ExceptionResumeFn = M.getFunction("__cilkrts_check_exception_resume"); + for (Instruction &I : instructions(Fn)) + if (CallBase *CB = dyn_cast(&I)) + if (CB->getCalledFunction() == ExceptionRaiseFn || + CB->getCalledFunction() == ExceptionResumeFn) + CB->removeFnAttr(Attribute::NoUnwind); +} + +namespace { + +// Custom DiagnosticInfo for linking the OpenCilk ABI bitcode file. +class OpenCilkABILinkDiagnosticInfo : public DiagnosticInfo { + const Module *SrcM; + const Twine &Msg; + +public: + OpenCilkABILinkDiagnosticInfo(DiagnosticSeverity Severity, const Module *SrcM, + const Twine &Msg) + : DiagnosticInfo(DK_Lowering, Severity), SrcM(SrcM), Msg(Msg) {} + void print(DiagnosticPrinter &DP) const override { + DP << "linking module '" << SrcM->getModuleIdentifier() << "': " << Msg; + } +}; + +// Custom DiagnosticHandler to handle diagnostics arising when linking the +// OpenCilk ABI bitcode file. 
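+// Non-linker diagnostics are forwarded to the original handler unchanged;
+// linker diagnostics are rewrapped so that the reported message names the
+// bitcode module being imported.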
+class OpenCilkABIDiagnosticHandler final : public DiagnosticHandler { + const Module *SrcM; + DiagnosticHandler *OrigHandler; + +public: + OpenCilkABIDiagnosticHandler(const Module *SrcM, + DiagnosticHandler *OrigHandler) + : SrcM(SrcM), OrigHandler(OrigHandler) {} + + bool handleDiagnostics(const DiagnosticInfo &DI) override { + if (DI.getKind() != DK_Linker) + return OrigHandler->handleDiagnostics(DI); + + std::string MsgStorage; + { + raw_string_ostream Stream(MsgStorage); + DiagnosticPrinterRawOStream DP(Stream); + DI.print(DP); + } + return OrigHandler->handleDiagnostics( + OpenCilkABILinkDiagnosticInfo(DI.getSeverity(), SrcM, MsgStorage)); + } +}; + +// Structure recording information about Cilk ABI functions. +struct CilkRTSFnDesc { + StringRef FnName; + FunctionType *FnType; + FunctionCallee &FnCallee; +}; + +} // namespace + +void OpenCilkABI::setOptions(const TapirTargetOptions &Options) { + if (!isa(Options)) + return; + + const OpenCilkABIOptions &OptionsCast = cast(Options); + + // Get the path to the runtime bitcode file. + RuntimeBCPath = OptionsCast.getRuntimeBCPath(); +} + +void OpenCilkABI::prepareModule() { + LLVMContext &C = M.getContext(); + Type *Int8Ty = Type::getInt8Ty(C); + Type *Int16Ty = Type::getInt16Ty(C); + Type *Int32Ty = Type::getInt32Ty(C); + Type *Int64Ty = Type::getInt64Ty(C); + + if (UseOpenCilkRuntimeBC) { + // If a runtime bitcode path is given via the command line, use it. + if ("" != ClOpenCilkRuntimeBCPath) + RuntimeBCPath = ClOpenCilkRuntimeBCPath; + + if ("" == RuntimeBCPath) + C.emitError("OpenCilkABI: No OpenCilk bitcode ABI file given."); + + LLVM_DEBUG(dbgs() << "Using external bitcode file for OpenCilk ABI: " + << RuntimeBCPath << "\n"); + SMDiagnostic SMD; + + // Parse the bitcode file. This call imports structure definitions, but not + // function definitions. + if (std::unique_ptr ExternalModule = + parseIRFile(RuntimeBCPath, SMD, C)) { + // Get the original DiagnosticHandler for this context. + std::unique_ptr OrigDiagHandler = + C.getDiagnosticHandler(); + + // Setup an OpenCilkABIDiagnosticHandler for this context, to handle + // diagnostics that arise from linking ExternalModule. + C.setDiagnosticHandler(std::make_unique( + ExternalModule.get(), OrigDiagHandler.get())); + + // Link the external module into the current module, copying over global + // values. + // + // TODO: Consider restructuring the import process to use + // Linker::Flags::LinkOnlyNeeded to copy over only the necessary contents + // from the external module. + bool Fail = Linker::linkModules( + M, std::move(ExternalModule), Linker::Flags::None, + [](Module &M, const StringSet<> &GVS) { + for (StringRef GVName : GVS.keys()) { + LLVM_DEBUG(dbgs() << "Linking global value " << GVName << "\n"); + if (Function *Fn = M.getFunction(GVName)) { + if (!Fn->isDeclaration()) + // We set the function's linkage as available_externally, so + // that subsequent optimizations can remove these definitions + // from the module. We don't want this module redefining any of + // these symbols, even if they aren't inlined, because the + // OpenCilk runtime library will provide those definitions + // later. + Fn->setLinkage(Function::AvailableExternallyLinkage); + } else if (GlobalVariable *G = M.getGlobalVariable(GVName)) { + if (!G->isDeclaration()) + G->setLinkage(GlobalValue::AvailableExternallyLinkage); + } + } + }); + if (Fail) + C.emitError("OpenCilkABI: Failed to link bitcode ABI file: " + + Twine(RuntimeBCPath)); + + // Restore the original DiagnosticHandler for this context. 
+ C.setDiagnosticHandler(std::move(OrigDiagHandler)); + } else { + C.emitError("OpenCilkABI: Failed to parse bitcode ABI file: " + + Twine(RuntimeBCPath)); + } + } + + // Get or create local definitions of Cilk RTS structure types. + const char *StackFrameName = "struct.__cilkrts_stack_frame"; + StackFrameTy = StructType::lookupOrCreate(C, StackFrameName); + WorkerTy = StructType::lookupOrCreate(C, "struct.__cilkrts_worker"); + + PointerType *StackFramePtrTy = PointerType::getUnqual(StackFrameTy); + Type *VoidTy = Type::getVoidTy(C); + Type *VoidPtrTy = Type::getInt8PtrTy(C); + + // Define the types of the CilkRTS functions. + FunctionType *CilkRTSFnTy = + FunctionType::get(VoidTy, {StackFramePtrTy}, false); + FunctionType *CilkPrepareSpawnFnTy = + FunctionType::get(Int32Ty, {StackFramePtrTy}, false); + FunctionType *CilkRTSEnterLandingpadFnTy = + FunctionType::get(VoidTy, {StackFramePtrTy, Int32Ty}, false); + FunctionType *CilkRTSPauseFrameFnTy = FunctionType::get( + VoidTy, {StackFramePtrTy, PointerType::getInt8PtrTy(C)}, false); + FunctionType *Grainsize8FnTy = FunctionType::get(Int8Ty, {Int8Ty}, false); + FunctionType *Grainsize16FnTy = FunctionType::get(Int16Ty, {Int16Ty}, false); + FunctionType *Grainsize32FnTy = FunctionType::get(Int32Ty, {Int32Ty}, false); + FunctionType *Grainsize64FnTy = FunctionType::get(Int64Ty, {Int64Ty}, false); + FunctionType *LookupTy = FunctionType::get( + VoidPtrTy, {VoidPtrTy, Int64Ty, VoidPtrTy, VoidPtrTy}, false); + FunctionType *UnregTy = FunctionType::get(VoidTy, {VoidPtrTy}, false); + FunctionType *Reg32Ty = + FunctionType::get(VoidTy, {VoidPtrTy, Int32Ty, VoidPtrTy, + VoidPtrTy}, false); + FunctionType *Reg64Ty = + FunctionType::get(VoidTy, {VoidPtrTy, Int64Ty, VoidPtrTy, + VoidPtrTy}, false); + + // Create an array of CilkRTS functions, with their associated types and + // FunctionCallee member variables in the OpenCilkABI class. + CilkRTSFnDesc CilkRTSFunctions[] = { + {"__cilkrts_enter_frame", CilkRTSFnTy, CilkRTSEnterFrame}, + {"__cilkrts_enter_frame_helper", CilkRTSFnTy, CilkRTSEnterFrameHelper}, + {"__cilkrts_detach", CilkRTSFnTy, CilkRTSDetach}, + {"__cilkrts_leave_frame", CilkRTSFnTy, CilkRTSLeaveFrame}, + {"__cilkrts_leave_frame_helper", CilkRTSFnTy, CilkRTSLeaveFrameHelper}, + {"__cilk_prepare_spawn", CilkPrepareSpawnFnTy, CilkPrepareSpawn}, + {"__cilk_sync", CilkRTSFnTy, CilkSync}, + {"__cilk_sync_nothrow", CilkRTSFnTy, CilkSyncNoThrow}, + {"__cilk_parent_epilogue", CilkRTSFnTy, CilkParentEpilogue}, + {"__cilk_helper_epilogue", CilkRTSFnTy, CilkHelperEpilogue}, + {"__cilkrts_enter_landingpad", CilkRTSEnterLandingpadFnTy, + CilkRTSEnterLandingpad}, + {"__cilkrts_pause_frame", CilkRTSPauseFrameFnTy, CilkRTSPauseFrame}, + {"__cilk_helper_epilogue_exn", CilkRTSPauseFrameFnTy, + CilkHelperEpilogueExn}, + {"__cilkrts_cilk_for_grainsize_8", Grainsize8FnTy, + CilkRTSCilkForGrainsize8}, + {"__cilkrts_cilk_for_grainsize_16", Grainsize16FnTy, + CilkRTSCilkForGrainsize16}, + {"__cilkrts_cilk_for_grainsize_32", Grainsize32FnTy, + CilkRTSCilkForGrainsize32}, + {"__cilkrts_cilk_for_grainsize_64", Grainsize64FnTy, + CilkRTSCilkForGrainsize64}, + {"__cilkrts_reducer_lookup", LookupTy, CilkRTSReducerLookup}, + {"__cilkrts_reducer_register_32", Reg32Ty, CilkRTSReducerRegister32}, + {"__cilkrts_reducer_register_64", Reg64Ty, CilkRTSReducerRegister64}, + {"__cilkrts_reducer_unregister", UnregTy, CilkRTSReducerUnregister}, + }; + + if (UseOpenCilkRuntimeBC) { + // Add attributes to internalized functions. 
+ for (CilkRTSFnDesc FnDesc : CilkRTSFunctions) { + assert(!FnDesc.FnCallee && "Redefining Cilk function"); + FnDesc.FnCallee = M.getOrInsertFunction(FnDesc.FnName, FnDesc.FnType); + assert(isa(FnDesc.FnCallee.getCallee()) && + "Cilk runtime function is not a function"); + Function *Fn = cast(FnDesc.FnCallee.getCallee()); + + // Because __cilk_sync is a C function that can throw an exception, update + // its attributes specially. No other CilkRTS functions can throw an + // exception. + if ("__cilk_sync" == FnDesc.FnName) + fixCilkSyncFn(M, Fn); + else + Fn->setDoesNotThrow(); + + // Unless we're debugging, mark the function as always_inline. This + // attribute is required for some functions, but is helpful for all + // functions. + if (!DebugABICalls) + Fn->addFnAttr(Attribute::AlwaysInline); + else + Fn->removeFnAttr(Attribute::AlwaysInline); + } + if (GlobalVariable *AlignVar = + M.getGlobalVariable("__cilkrts_stack_frame_align", true)) { + StackFrameAlign = AlignVar->getAlign(); + // Mark this variable with private linkage, to avoid linker failures when + // compiling with no optimizations. + AlignVar->setLinkage(GlobalValue::PrivateLinkage); + } + } else if (!DebugABICalls) { + // The OpenCilkABI target requires the use of a bitcode ABI file to generate + // correct code. + C.emitError( + "OpenCilkABI: Bitcode ABI file required for correct code generation."); + } + + // If no valid bitcode file was found fill in the missing pieces. + // An error should have been emitted already unless the user + // set DebugABICalls. + + if (StackFrameTy->isOpaque()) { + // Create a dummy __cilkrts_stack_frame structure + StackFrameTy->setBody(Int64Ty); + } + // Create declarations of all CilkRTS functions, and add basic attributes to + // those declarations. + for (CilkRTSFnDesc FnDesc : CilkRTSFunctions) { + if (FnDesc.FnCallee) + continue; + FnDesc.FnCallee = M.getOrInsertFunction(FnDesc.FnName, FnDesc.FnType); + assert(isa(FnDesc.FnCallee.getCallee()) && + "Cilk function is not a function"); + Function *Fn = cast(FnDesc.FnCallee.getCallee()); + + // Mark all CilkRTS functions nounwind, except for __cilk_sync. + if ("__cilk_sync" == FnDesc.FnName) + Fn->removeFnAttr(Attribute::NoUnwind); + else + Fn->setDoesNotThrow(); + } +} + +void OpenCilkABI::addHelperAttributes(Function &Helper) { + // Use a fast calling convention for the helper. + Helper.setCallingConv(CallingConv::Fast); + // Inlining the helper function is not legal. + Helper.removeFnAttr(Attribute::AlwaysInline); + Helper.addFnAttr(Attribute::NoInline); + // If the helper uses an argument structure, then it is not a write-only + // function. + if (getArgStructMode() != ArgStructMode::None) { + Helper.removeFnAttr(Attribute::WriteOnly); + Helper.setMemoryEffects( + MemoryEffects(MemoryEffects::Location::Other, ModRefInfo::ModRef)); + } + // Note that the address of the helper is unimportant. + Helper.setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + // The helper is internal to this module. We use internal linkage, rather + // than private linkage, so that tools can still reference the helper + // function. + Helper.setLinkage(GlobalValue::InternalLinkage); +} + +void OpenCilkABI::remapAfterOutlining(BasicBlock *TFEntry, + ValueToValueMapTy &VMap) { + if (TapirRTCalls[TFEntry].empty()) + return; + + // Update the set of tapir.runtime.{start,end} intrinsics in the taskframe + // rooted at TFEntry to process. 
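+  // (Outlining cloned these intrinsic calls, so the recorded instruction
+  // pointers must be translated through VMap to their clones.)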
+ SmallVector OldTapirRTCalls(TapirRTCalls[TFEntry]); + TapirRTCalls[TFEntry].clear(); + for (IntrinsicInst *II : OldTapirRTCalls) + TapirRTCalls[TFEntry].push_back(cast(VMap[II])); +} + +// Check whether the allocation of a __cilkrts_stack_frame can be inserted after +// instruction \p I. +static bool skipInstruction(const Instruction &I) { + if (isa(I)) + return true; + + if (isa(I)) + return true; + + if (const IntrinsicInst *II = dyn_cast(&I)) { + // Skip simple intrinsics + switch(II->getIntrinsicID()) { + case Intrinsic::annotation: + case Intrinsic::assume: + case Intrinsic::sideeffect: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + case Intrinsic::is_constant: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + case Intrinsic::experimental_gc_result: + case Intrinsic::experimental_gc_relocate: + case Intrinsic::experimental_noalias_scope_decl: + case Intrinsic::syncregion_start: + case Intrinsic::taskframe_create: + return true; + default: + return false; + } + } + + return false; +} + +// Scan the basic block \p B to find a point to insert the allocation of a +// __cilkrts_stack_frame. +static Instruction *getStackFrameInsertPt(BasicBlock &B) { + BasicBlock::iterator BI(B.getFirstInsertionPt()); + BasicBlock::const_iterator BE(B.end()); + + // Scan the basic block for the first instruction we should not skip. + while (BI != BE) { + if (!skipInstruction(*BI)) { + return &*BI; + } + ++BI; + } + + // We reached the end of the basic block; return the terminator. + return B.getTerminator(); +} + +/// Create the __cilkrts_stack_frame for the spawning function. +Value *OpenCilkABI::CreateStackFrame(Function &F) { + const DataLayout &DL = F.getParent()->getDataLayout(); + Type *SFTy = StackFrameTy; + + IRBuilder<> B(getStackFrameInsertPt(F.getEntryBlock())); + AllocaInst *SF = B.CreateAlloca(SFTy, DL.getAllocaAddrSpace(), + /*ArraySize*/ nullptr, + /*Name*/ StackFrameName); + if (StackFrameAlign) + SF->setAlignment(StackFrameAlign.valueOrOne()); + + return SF; +} + +Value* OpenCilkABI::GetOrCreateCilkStackFrame(Function &F) { + if (DetachCtxToStackFrame.count(&F)) + return DetachCtxToStackFrame[&F]; + + Value *SF = CreateStackFrame(F); + DetachCtxToStackFrame[&F] = SF; + + return SF; +} + +// Insert a call in Function F to __cilkrts_detach at DetachPt, which must be +// after the allocation of the __cilkrts_stack_frame in F. +void OpenCilkABI::InsertDetach(Function &F, Instruction *DetachPt) { + Instruction *SF = cast(GetOrCreateCilkStackFrame(F)); + assert(SF && "No Cilk stack frame for Cilk function."); + Value *Args[1] = {SF}; + + // Scan function to see if it detaches. + LLVM_DEBUG({ + bool SimpleHelper = !canDetach(&F); + if (!SimpleHelper) + dbgs() << "NOTE: Detachable helper function itself detaches.\n"; + }); + + // Call __cilkrts_detach + IRBuilder<> IRB(DetachPt); + IRB.CreateCall(CILKRTS_FUNC(detach), Args); +} + +// Insert a call in Function F to __cilkrts_enter_frame{_helper} to initialize +// the __cilkrts_stack_frame in F. If TaskFrameCreate is nonnull, the call to +// __cilkrts_enter_frame{_helper} is inserted at TaskFramecreate. 
+CallInst *OpenCilkABI::InsertStackFramePush(Function &F,
+                                            Instruction *TaskFrameCreate,
+                                            bool Helper) {
+  Instruction *SF = cast<Instruction>(GetOrCreateCilkStackFrame(F));
+
+  BasicBlock::iterator InsertPt = ++SF->getIterator();
+  IRBuilder<> B(&(F.getEntryBlock()), InsertPt);
+  if (TaskFrameCreate)
+    B.SetInsertPoint(TaskFrameCreate);
+  if (!B.getCurrentDebugLocation()) {
+    // Try to find debug information later in this block for the ABI call.
+    BasicBlock::iterator BI = B.GetInsertPoint();
+    BasicBlock::const_iterator BE(B.GetInsertBlock()->end());
+    while (BI != BE) {
+      if (DebugLoc Loc = BI->getDebugLoc()) {
+        B.SetCurrentDebugLocation(Loc);
+        break;
+      }
+      ++BI;
+    }
+
+    // Next, try to find debug information earlier in this block.
+    if (!B.getCurrentDebugLocation()) {
+      BI = B.GetInsertPoint();
+      BasicBlock::const_iterator BB(B.GetInsertBlock()->begin());
+      while (BI != BB) {
+        --BI;
+        if (DebugLoc Loc = BI->getDebugLoc()) {
+          B.SetCurrentDebugLocation(Loc);
+          break;
+        }
+      }
+    }
+  }
+
+  Value *Args[1] = {SF};
+  if (Helper)
+    return B.CreateCall(CILKRTS_FUNC(enter_frame_helper), Args);
+  else
+    return B.CreateCall(CILKRTS_FUNC(enter_frame), Args);
+}
+
+// Insert a call in Function F to the appropriate epilogue function.
+//
+// - A call to __cilk_parent_epilogue() is inserted at a return from a
+//   spawning function.
+//
+// - A call to __cilk_helper_epilogue() is inserted at a return from a
+//   spawn-helper function.
+//
+// - A call to __cilk_helper_epilogue_exn() is inserted at a resume from a
+//   spawn-helper function.
+//
+// PromoteCallsToInvokes dictates whether call instructions that can throw are
+// promoted to invoke instructions prior to inserting the epilogue-function
+// calls.
+void OpenCilkABI::InsertStackFramePop(Function &F, bool PromoteCallsToInvokes,
+                                      bool InsertPauseFrame, bool Helper) {
+  Value *SF = GetOrCreateCilkStackFrame(F);
+  SmallPtrSet<ReturnInst *, 8> Returns;
+  SmallPtrSet<ResumeInst *, 8> Resumes;
+
+  // Add EH cleanup that returns control to the runtime.
+  EscapeEnumerator EE(F, "cilkrts_cleanup", PromoteCallsToInvokes);
+  while (IRBuilder<> *Builder = EE.Next()) {
+    if (ResumeInst *RI = dyn_cast<ResumeInst>(Builder->GetInsertPoint())) {
+      if (!RI->getDebugLoc())
+        // Attempt to set the debug location of this resume to match one of
+        // the preceding terminators.
+        for (const BasicBlock *Pred : predecessors(RI->getParent()))
+          if (const DebugLoc &Loc = Pred->getTerminator()->getDebugLoc()) {
+            RI->setDebugLoc(Loc);
+            break;
+          }
+      Resumes.insert(RI);
+    } else if (ReturnInst *RI =
+                   dyn_cast<ReturnInst>(Builder->GetInsertPoint()))
+      Returns.insert(RI);
+  }
+
+  for (ReturnInst *RI : Returns) {
+    if (Helper) {
+      CallInst::Create(GetCilkHelperEpilogueFn(), {SF}, "", RI)
+          ->setDebugLoc(RI->getDebugLoc());
+    } else {
+      CallInst::Create(GetCilkParentEpilogueFn(), {SF}, "", RI)
+          ->setDebugLoc(RI->getDebugLoc());
+    }
+  }
+  for (ResumeInst *RI : Resumes) {
+    if (InsertPauseFrame) {
+      Value *Exn = ExtractValueInst::Create(RI->getValue(), {0}, "", RI);
+      // If throwing an exception, pass the exception object to the epilogue
+      // function.
+      CallInst::Create(GetCilkHelperEpilogueExnFn(), {SF, Exn}, "", RI)
+          ->setDebugLoc(RI->getDebugLoc());
+    }
+  }
+}
+
+// Lower any calls to tapir.runtime.{start,end} that need to be processed.
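+//
+// Schematically (illustrative IR), a region bracketed as
+//   %rt = call token @llvm.tapir.runtime.start()
+//   ...
+//   call void @llvm.tapir.runtime.end(token %rt)
+// has its start marker lowered to __cilkrts_enter_frame(&sf) and each paired
+// end marker lowered to __cilk_parent_epilogue(&sf).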
+void OpenCilkABI::LowerTapirRTCalls(Function &F, BasicBlock *TFEntry) { + Instruction *SF = cast(GetOrCreateCilkStackFrame(F)); + for (IntrinsicInst *II : TapirRTCalls[TFEntry]) { + IRBuilder<> Builder(II); + if (Intrinsic::tapir_runtime_start == II->getIntrinsicID()) { + // Lower calls to tapir.runtime.start to __cilkrts_enter_frame. + Builder.CreateCall(CILKRTS_FUNC(enter_frame), {SF}); + + // Find all tapir.runtime.ends that use this tapir.runtime.start, and + // lower them to calls to __cilk_parent_epilogue. + for (Use &U : II->uses()) + if (IntrinsicInst *UII = dyn_cast(U.getUser())) + if (Intrinsic::tapir_runtime_end == UII->getIntrinsicID()) { + Builder.SetInsertPoint(UII); + Builder.CreateCall(GetCilkParentEpilogueFn(), {SF}); + } + } + } +} + +void OpenCilkABI::MarkSpawner(Function &F) { + // If the spawner F might throw, then we mark F with the Cilk personality + // function, which ensures that the Cilk stack frame of F is properly unwound. + if (!F.doesNotThrow()) { + LLVMContext &C = M.getContext(); + // Get the type of the Cilk personality function the same way that clang and + // EscapeEnumerator get the type of a personality function. + Function *Personality = cast( + M.getOrInsertFunction("__cilk_personality_v0", + FunctionType::get(Type::getInt32Ty(C), true)) + .getCallee()); + F.setPersonalityFn(Personality); + } + + // Mark this function as stealable. + F.addFnAttr(Attribute::Stealable); + F.setMemoryEffects( + MemoryEffects(MemoryEffects::Location::Other, ModRefInfo::ModRef)); +} + +/// Lower a call to get the grainsize of a Tapir loop. +Value *OpenCilkABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { + Value *Limit = GrainsizeCall->getArgOperand(0); + IRBuilder<> Builder(GrainsizeCall); + + // Select the appropriate __cilkrts_grainsize function, based on the type. + FunctionCallee CilkRTSGrainsizeCall; + if (GrainsizeCall->getType()->isIntegerTy(8)) + CilkRTSGrainsizeCall = CILKRTS_FUNC(cilk_for_grainsize_8); + else if (GrainsizeCall->getType()->isIntegerTy(16)) + CilkRTSGrainsizeCall = CILKRTS_FUNC(cilk_for_grainsize_16); + else if (GrainsizeCall->getType()->isIntegerTy(32)) + CilkRTSGrainsizeCall = CILKRTS_FUNC(cilk_for_grainsize_32); + else if (GrainsizeCall->getType()->isIntegerTy(64)) + CilkRTSGrainsizeCall = CILKRTS_FUNC(cilk_for_grainsize_64); + else + llvm_unreachable("No CilkRTSGrainsize call matches type for Tapir loop."); + + Value *Grainsize = Builder.CreateCall(CilkRTSGrainsizeCall, Limit); + + // Replace uses of grainsize intrinsic call with this grainsize value. + GrainsizeCall->replaceAllUsesWith(Grainsize); + return Grainsize; +} + +BasicBlock *OpenCilkABI::GetDefaultSyncLandingpad(Function &F, Value *SF, + DebugLoc Loc) { + // Return an existing default sync landingpad, if there is one. + if (DefaultSyncLandingpad.count(&F)) + return cast(DefaultSyncLandingpad[&F]); + + // Create a default cleanup landingpad block. + LLVMContext &C = F.getContext(); + const Twine Name = "default_sync_lpad"; + BasicBlock *CleanupBB = BasicBlock::Create(C, Name, &F); + Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C)); + + IRBuilder<> Builder(CleanupBB); + Builder.SetCurrentDebugLocation(Loc); + LandingPadInst *LPad = Builder.CreateLandingPad(ExnTy, 1, Name + ".lpad"); + LPad->setCleanup(true); + // Insert a call to __cilkrts_enter_landingpad. + Value *Sel = Builder.CreateExtractValue(LPad, {1}, "sel"); + Value *CilkLPadArgs[] = {SF, Sel}; + Builder.CreateCall(CILKRTS_FUNC(enter_landingpad), CilkLPadArgs, ""); + // Insert a resume. 
+ Builder.CreateResume(LPad); + + DefaultSyncLandingpad[&F] = CleanupBB; + + return CleanupBB; +} + +// Lower a sync instruction SI. +void OpenCilkABI::lowerSync(SyncInst &SI) { + Function &Fn = *SI.getFunction(); + if (!DetachCtxToStackFrame[&Fn]) + // If we have not created a stackframe for this function, then we don't need + // to handle the sync. + return; + + Value *SF = GetOrCreateCilkStackFrame(Fn); + Value *Args[] = { SF }; + assert(Args[0] && "sync used in function without frame!"); + + Instruction *SyncUnwind = nullptr; + BasicBlock *SyncCont = SI.getSuccessor(0); + BasicBlock *SyncUnwindDest = nullptr; + // Determine whether a sync.unwind immediately follows SI. + if (InvokeInst *II = + dyn_cast(SyncCont->getFirstNonPHIOrDbgOrLifetime())) { + if (isSyncUnwind(II)) { + SyncUnwind = II; + SyncCont = II->getNormalDest(); + SyncUnwindDest = II->getUnwindDest(); + } + } else if (CallBase *CB = dyn_cast( + SyncCont->getFirstNonPHIOrDbgOrLifetime())) { + if (isSyncUnwind(CB)) + SyncUnwind = CB; + } + + CallBase *CB; + if (!SyncUnwindDest) { + if (Fn.doesNotThrow()) { + // This function doesn't throw any exceptions, so use the no-throw version + // of cilk_sync. + CB = CallInst::Create(GetCilkSyncNoThrowFn(), Args, "", + /*insert before*/ &SI); + BranchInst::Create(SyncCont, CB->getParent()); + } else if (SyncUnwind) { + // The presence of the sync.unwind indicates that the sync might rethrow + // an exception, but there isn't a landingpad associated with the sync. + + // Get the default sync-landingpad block to use instead, creating it if + // necessary. + BasicBlock *DefaultSyncLandingpad = + GetDefaultSyncLandingpad(Fn, SF, SI.getDebugLoc()); + + // Invoke __cilk_sync, using DefaultSyncLandingpad as the unwind + // destination. + CB = InvokeInst::Create(GetCilkSyncFn(), SyncCont, DefaultSyncLandingpad, + Args, "", + /*insert before*/ &SI); + } else { + // TODO: This case shouldn't be reachable. Check whether it is reachable. + CB = CallInst::Create(GetCilkSyncFn(), Args, "", /*insert before*/ &SI); + BranchInst::Create(SyncCont, CB->getParent()); + } + } else { + CB = InvokeInst::Create(GetCilkSyncFn(), SyncCont, SyncUnwindDest, Args, "", + /*insert before*/ &SI); + for (PHINode &PN : SyncCont->phis()) + PN.addIncoming(PN.getIncomingValueForBlock(SyncUnwind->getParent()), + SI.getParent()); + for (PHINode &PN : SyncUnwindDest->phis()) + PN.addIncoming(PN.getIncomingValueForBlock(SyncUnwind->getParent()), + SI.getParent()); + } + CB->setDebugLoc(SI.getDebugLoc()); + SI.eraseFromParent(); + + // Remember to inline this call later. + CallsToInline.insert(CB); + + // Mark this function as stealable. + Fn.addFnAttr(Attribute::Stealable); +} + +void OpenCilkABI::preProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, + bool IsSpawner, BasicBlock *TFEntry) { + // If the outlined task F itself performs spawns, set up F to support stealing + // continuations. + if (IsSpawner) + MarkSpawner(F); + + CallInst *EnterFrame = + InsertStackFramePush(F, TaskFrameCreate, /*Helper*/ true); + InsertDetach(F, (DetachPt ? DetachPt : &*(++EnterFrame->getIterator()))); +} + +void OpenCilkABI::postProcessOutlinedTask(Function &F, Instruction *DetachPt, + Instruction *TaskFrameCreate, + bool IsSpawner, BasicBlock *TFEntry) { + // Because F is a spawned task, we want to insert landingpads for all calls + // that can throw, so we can pop the stackframe correctly if they do throw. 
+ // In particular, popping the stackframe of a spawned task may discover that + // the parent was stolen, in which case we want to save the exception for + // later reduction. + InsertStackFramePop(F, /*PromoteCallsToInvokes*/ true, + /*InsertPauseFrame*/ true, /*Helper*/ true); + + // TODO: If F is itself a spawner, see if we need to ensure that the Cilk + // personality function does not pop an already-popped frame. We might be + // able to do this by checking if sf->call_parent == NULL before performing a + // pop in the personality function. +} + +void OpenCilkABI::preProcessRootSpawner(Function &F, BasicBlock *TFEntry) { + MarkSpawner(F); + if (TapirRTCalls[TFEntry].empty()) { + InsertStackFramePush(F); + } else { + LowerTapirRTCalls(F, TFEntry); + } + Value *SF = DetachCtxToStackFrame[&F]; + for (BasicBlock &BB : F) { + if (BB.isLandingPad()) { + LandingPadInst *LPad = BB.getLandingPadInst(); + Instruction *InsertPt = &*BB.getFirstInsertionPt(); + IRBuilder<> Builder(InsertPt); + // Try to find debug information for the ABI call. First check the + // landing pad. + if (!Builder.getCurrentDebugLocation()) + Builder.SetCurrentDebugLocation(LPad->getDebugLoc()); + // Next, check later in the block + if (!Builder.getCurrentDebugLocation()) { + BasicBlock::iterator BI = Builder.GetInsertPoint(); + BasicBlock::const_iterator BE(Builder.GetInsertBlock()->end()); + while (BI != BE) { + if (DebugLoc Loc = BI->getDebugLoc()) { + Builder.SetCurrentDebugLocation(Loc); + break; + } + ++BI; + } + } + + Value *Sel = Builder.CreateExtractValue(LPad, 1, "sel"); + Builder.CreateCall(CILKRTS_FUNC(enter_landingpad), {SF, Sel}); + } + } +} + +void OpenCilkABI::postProcessRootSpawner(Function &F, BasicBlock *TFEntry) { + // F is a root spawner, not itself a spawned task. We don't need to promote + // calls to invokes, since the Cilk personality function will take care of + // popping the frame if no landingpad exists for a given call. + if (TapirRTCalls[TFEntry].empty()) + InsertStackFramePop(F, /*PromoteCallsToInvokes*/ false, + /*InsertPauseFrame*/ false, /*Helper*/ false); +} + +void OpenCilkABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) { + Instruction *ReplStart = TOI.ReplStart; + Instruction *ReplCall = TOI.ReplCall; + + Function &F = *ReplCall->getFunction(); + Value *SF = DetachCtxToStackFrame[&F]; + assert(SF && "No frame found for spawning task"); + + // Split the basic block containing the detach replacement just before the + // start of the detach-replacement instructions. + BasicBlock *DetBlock = ReplStart->getParent(); + BasicBlock *CallBlock = SplitBlock(DetBlock, ReplStart, &DT); + + // Emit a __cilk_spawn_prepare at the end of the block preceding the split-off + // detach replacement. + Instruction *SpawnPt = DetBlock->getTerminator(); + IRBuilder<> B(SpawnPt); + CallBase *SpawnPrepCall = B.CreateCall(GetCilkPrepareSpawnFn(), {SF}); + + // Remember to inline this call later. + CallsToInline.insert(SpawnPrepCall); + + // Get the ordinary continuation of the detach. + BasicBlock *CallCont; + if (InvokeInst *II = dyn_cast(ReplCall)) + CallCont = II->getNormalDest(); + else // isa(CallSite) + CallCont = CallBlock->getSingleSuccessor(); + + // Insert a conditional branch, based on the result of the + // __cilk_spawn_prepare, to either the detach replacement or the continuation. 
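+  // __cilk_prepare_spawn is presumably setjmp-like: a zero return falls
+  // through to the spawned call in CallBlock, while a nonzero return (seen
+  // when the continuation is resumed after a steal) branches directly to
+  // CallCont.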
+ Value *SpawnPrepRes = B.CreateICmpEQ( + SpawnPrepCall, ConstantInt::get(SpawnPrepCall->getType(), 0)); + B.CreateCondBr(SpawnPrepRes, CallBlock, CallCont); + for (PHINode &PN : CallCont->phis()) + PN.addIncoming(PN.getIncomingValueForBlock(CallBlock), DetBlock); + + SpawnPt->eraseFromParent(); +} + +// Helper function to inline calls to compiler-generated Cilk runtime functions +// when possible. This inlining is necessary to properly implement some Cilk +// runtime "calls," such as __cilk_sync(). +static inline void inlineCilkFunctions( + Function &F, SmallPtrSetImpl &CallsToInline) { + for (CallBase *CB : CallsToInline) { + InlineFunctionInfo IFI; + InlineFunction(*CB, IFI); + } + CallsToInline.clear(); +} + +// For the taskframe at \p TFEntry containing blocks \p TFBlocks, find all +// outermost tapir.runtime.{start,end} intrinsics, which are not enclosed +// between other tapir.runtime.{start,end} intrinsics in this traksframe. +// Furthermore, record and successor taskframes in \p SuccessorTFs that are not +// enclosed between tapir.runtime.{start,end} intrinsics. +static bool findOutermostTapirRTCallsForTaskFrame( + SmallVectorImpl &TapirRTCalls, BasicBlock *TFEntry, + SmallPtrSetImpl &TFBlocks, + SmallPtrSetImpl &SuccessorTFs, TaskInfo &TI) { + SmallVector Worklist; + SmallPtrSet Visited; + Worklist.push_back(TFEntry->begin()); + + while (!Worklist.empty()) { + BasicBlock::iterator Iter = Worklist.pop_back_val(); + BasicBlock *BB = Iter->getParent(); + + bool FoundTapirRTStart = false; + bool FoundTapirRTEnd = false; + SmallVector EndIters; + // Scan the BB for tapir_runtime calls. + for (BasicBlock::iterator It = Iter, E = BB->end(); It != E; ++It) { + Instruction *I = &*It; + if (isTapirIntrinsic(Intrinsic::tapir_runtime_start, I)) { + FoundTapirRTStart = true; + TapirRTCalls.push_back(cast(I)); + // Examine corresponding tapir_runtime_end intrinsics to find blocks + // from which to continue search. + for (Use &U : I->uses()) { + if (Instruction *UI = dyn_cast(U.getUser())) { + FoundTapirRTEnd = true; + BasicBlock *EndBB = UI->getParent(); + assert(TFBlocks.count(EndBB) && "tapir_runtime_end not in same " + "taskframe as tapir_runtime_begin"); + EndIters.push_back(++UI->getIterator()); + } + } + + if (FoundTapirRTEnd) + // We found a tapir_runtime_begin in this block, so stop searching. + break; + } + } + + // If we didn't find a tapir_runtime_start in this block, treat this block + // as an end block, so we examine its direct successors. + if (!FoundTapirRTStart) + EndIters.push_back(BB->getTerminator()->getIterator()); + + // Examine all end blocks to 1) check if a spawn occurs, and 2) add + // successors within the taskframe for further search. + for (BasicBlock::iterator Iter : EndIters) { + if (isa(*Iter)) { + // We found a spawn terminating a block in this taskframe. This spawn + // is not contained between outermost tapir_runtime_{start,end} calls in + // the taskframe. Therefore, we should fall back to default behavior + // for inserting enter_frame and leave_frame calls for this taskframe. + TapirRTCalls.clear(); + return true; + } + + BasicBlock *EndBB = Iter->getParent(); + if (EndBB->getTerminator() != &*Iter) { + Worklist.push_back(Iter); + continue; + } + + // Add the successors of this block for further search. + for (BasicBlock *Succ : successors(EndBB)) { + if (TFBlocks.count(Succ) && Visited.insert(Succ).second) + // For successors within the taskframe, add them to the search. 
+ Worklist.push_back(Succ->begin()); + else { + // For successors in other taskframes, add the subtaskframe for + // processing. + Spindle *SuccSpindle = TI.getSpindleFor(Succ); + if (SuccSpindle->getTaskFrameCreate()) + SuccessorTFs.insert(SuccSpindle); + } + } + } + } + + return false; +} + +// Find all tapir.runtime.{start,end} intrinsics to process for the taskframe +// rooted at spindle \p TaskFrame and any subtaskframes thereof. +void OpenCilkABI::GetTapirRTCalls(Spindle *TaskFrame, bool IsRootTask, + TaskInfo &TI) { + BasicBlock *TFEntry = TaskFrame->getEntry(); + SmallPtrSet TFBlocks; + SmallVector SubTFs; + if (IsRootTask) { + // We have to compute the effective taskframe blocks for the root task, + // since these blocks are not automatically identified by TapirTaskInfo. + // + // Note: We could generalize TapirTaskInfo to compute these taskframe blocks + // directly, but this computation seems to be the only place that set of + // blocks is needed. + SmallPtrSet ExcludedSpindles; + // Exclude all spindles in unassociated taskframes under the root task. + for (Spindle *TFRoot : TI.getRootTask()->taskframe_roots()) { + if (!TFRoot->getTaskFromTaskFrame()) + SubTFs.push_back(TFRoot); + for (Spindle *TFSpindle : depth_first>(TFRoot)) { + if (TFSpindle->getTaskFromTaskFrame()) + continue; + + for (Spindle *S : TFSpindle->taskframe_spindles()) + ExcludedSpindles.insert(S); + } + } + + // Iterate over the spindles in the root task, and add all spindle blocks to + // TFBlocks as long as those blocks don't belong to a nested taskframe. + for (Spindle *S : + depth_first>(TI.getRootTask()->getEntrySpindle())) { + if (ExcludedSpindles.count(S)) + continue; + + TFBlocks.insert(S->block_begin(), S->block_end()); + } + } else { + // Add all blocks in all spindles associated with this taskframe. + for (Spindle *S : TaskFrame->taskframe_spindles()) + TFBlocks.insert(S->block_begin(), S->block_end()); + + for (Spindle *SubTF : TaskFrame->subtaskframes()) + if (!SubTF->getTaskFromTaskFrame()) + SubTFs.push_back(SubTF); + } + + // Find the outermost tapir_runtime_{start,end} calls in this taskframe. + // Record in SuccessorTFs any subtaskframes that are not enclosed in + // tapir.runtime.{start,end} intrinsics. + SmallPtrSet SuccessorTFs; + bool TaskFrameSpawns = findOutermostTapirRTCallsForTaskFrame( + TapirRTCalls[TFEntry], TFEntry, TFBlocks, SuccessorTFs, TI); + + // If this taskframe spawns outside of tapir_runtime_{start,end} pairs, then + // the taskframe will start/end the runtime when executed. Hence there's no + // need to evaluate subtaskframes. + if (TaskFrameSpawns) + return; + + // Process subtaskframes recursively. + for (Spindle *SubTF : SubTFs) { + // Skip any subtaskframes that are already enclosed in + // tapir.runtime.{start,end} intrinsics. + if (!SuccessorTFs.count(SubTF)) + continue; + + // Skip any taskframes that are associated with subtasks. + assert(!SubTF->getTaskFromTaskFrame() && + "Should not be processing spawned taskframes."); + + GetTapirRTCalls(SubTF, false, TI); + } +} + +bool OpenCilkABI::preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) { + if (ProcessingTapirLoops) + // Don't do any preprocessing when outlining Tapir loops. + return false; + + // Find all Tapir-runtime calls in this function that may be translated to + // enter_frame/leave_frame calls. 
+ GetTapirRTCalls(TI.getRootTask()->getEntrySpindle(), true, TI); + + if (!TI.isSerial() || TapirRTCalls[&F.getEntryBlock()].empty()) + return false; + + MarkSpawner(F); + LowerTapirRTCalls(F, &F.getEntryBlock()); + return false; +} + +void OpenCilkABI::postProcessFunction(Function &F, bool ProcessingTapirLoops) { + if (ProcessingTapirLoops) + // Don't do any postprocessing when outlining Tapir loops. + return; + + if (!DebugABICalls) + inlineCilkFunctions(F, CallsToInline); +} + +/// Process the Tapir instructions in an ordinary (non-spawning and not spawned) +/// function \p F directly. +bool OpenCilkABI::processOrdinaryFunction(Function &F, BasicBlock *TFEntry) { + // Get the simple Tapir instructions to process, including syncs and + // loop-grainsize calls. + SmallVector GrainsizeCalls; + SmallVector TaskFrameAddrCalls; + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + // Record calls to get Tapir-loop grainsizes. + if (IntrinsicInst *II = dyn_cast(&I)) + if (Intrinsic::tapir_loop_grainsize == II->getIntrinsicID()) + GrainsizeCalls.push_back(II); + + // Record calls to task_frameaddr intrinsics. + if (IntrinsicInst *II = dyn_cast(&I)) + if (Intrinsic::task_frameaddress == II->getIntrinsicID()) + TaskFrameAddrCalls.push_back(II); + } + } + + // Lower simple Tapir instructions in this function. Collect the set of + // helper functions generated by this process. + bool Changed = false; + + // Lower calls to get Tapir-loop grainsizes. + while (!GrainsizeCalls.empty()) { + CallInst *GrainsizeCall = GrainsizeCalls.pop_back_val(); + LLVM_DEBUG(dbgs() << "Lowering grainsize call " << *GrainsizeCall << "\n"); + lowerGrainsizeCall(GrainsizeCall); + Changed = true; + } + + // Lower calls to task_frameaddr intrinsics. + while (!TaskFrameAddrCalls.empty()) { + CallInst *TaskFrameAddrCall = TaskFrameAddrCalls.pop_back_val(); + LLVM_DEBUG(dbgs() << "Lowering task_frameaddr call " << *TaskFrameAddrCall + << "\n"); + lowerTaskFrameAddrCall(TaskFrameAddrCall); + Changed = true; + } + + // If any calls to tapir.runtime.{start,end} were found in this taskframe that + // need processing, lower them now. + if (!TapirRTCalls[TFEntry].empty()) { + LowerTapirRTCalls(F, TFEntry); + Changed = true; + } + + return Changed; +} + +void OpenCilkABI::postProcessHelper(Function &F) {} + +LoopOutlineProcessor *OpenCilkABI::getLoopOutlineProcessor( + const TapirLoopInfo *TL) const { + return nullptr; +} + +void OpenCilkABI::lowerReducerOperation(CallBase *CI) { + FunctionCallee Fn = nullptr; + const Function *Called = CI->getCalledFunction(); + assert(Called); + Intrinsic::ID ID = Called->getIntrinsicID(); + switch (ID) { + default: + llvm_unreachable("unexpected reducer intrinsic"); + } + CI->setCalledFunction(Fn); +} diff --git a/llvm/lib/Transforms/Tapir/Outline.cpp b/llvm/lib/Transforms/Tapir/Outline.cpp new file mode 100644 index 000000000000000..952a72f82be7b07 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/Outline.cpp @@ -0,0 +1,567 @@ +//===- TapirOutline.cpp - Outlining for Tapir -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements helper functions for outlining portions of code +// containing Tapir instructions. 
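+//
+// The two main entry points are CloneIntoFunction, which clones a set of
+// blocks into a new function while fixing up Tapir-specific control flow
+// (reattach blocks, detached.rethrow/taskframe.resume terminators, and shared
+// exception-handling entries), and CreateHelper, which builds the outlined
+// helper function whose parameters are the captured inputs and outputs.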
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/IR/AttributeMask.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/ModRef.h" +#include "llvm/Support/Timer.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "outlining" + +static const char TimerGroupName[] = DEBUG_TYPE; +static const char TimerGroupDescription[] = "Tapir outlining"; + +// Materialize any necessary information in DstM when outlining Tapir into DstM. +Value *OutlineMaterializer::materialize(Value *V) { + if (V == SrcSyncRegion) { + // Create a new sync region to replace the sync region SrcSyncRegion from + // the source. + + // Get the destination function + User *U = *(V->materialized_user_begin()); + Function *DstFunc = cast(U)->getFunction(); + // Add a new syncregion to the entry block of the destination function + Instruction *NewSyncReg = cast(SrcSyncRegion)->clone(); + BasicBlock *EntryBlock = &DstFunc->getEntryBlock(); + NewSyncReg->insertInto(EntryBlock, EntryBlock->end()); + // Record the entry block as needing remapping + BlocksToRemap.insert(EntryBlock); + return NewSyncReg; + } + + return nullptr; +} + +/// Clone Blocks into NewFunc, transforming the old arguments into references to +/// VMap values. +/// +/// This logic is based on CloneFunctionInto, defined in +/// Transforms/Utils/CloneFunction, but with additional functionality specific +/// to Tapir outlining. +void llvm::CloneIntoFunction(Function *NewFunc, const Function *OldFunc, + std::vector Blocks, + ValueToValueMapTy &VMap, bool ModuleLevelChanges, + SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ReattachBlocks, + SmallPtrSetImpl *TaskResumeBlocks, + SmallPtrSetImpl *SharedEHEntries, + DISubprogram *SP, ClonedCodeInfo *CodeInfo, + ValueMapTypeRemapper *TypeMapper, + OutlineMaterializer *Materializer) { + // Get the predecessors of the exit blocks + SmallPtrSet EHEntryPreds, ClonedEHEntryPreds; + if (SharedEHEntries) + for (BasicBlock *EHEntry : *SharedEHEntries) + for (BasicBlock *Pred : predecessors(EHEntry)) + EHEntryPreds.insert(Pred); + + // When we remap instructions, we want to avoid duplicating inlined + // DISubprograms, so record all subprograms we find as we duplicate + // instructions and then freeze them in the MD map. + // We also record information about dbg.value and dbg.declare to avoid + // duplicating the types. + DebugInfoFinder DIFinder; + + // Loop over all of the basic blocks in the function, cloning them as + // appropriate. + { + NamedRegionTimer NRT("CloneBlocks", "Clone basic blocks", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + for (const BasicBlock *BB : Blocks) { + // Record all exit block predecessors that are cloned. + if (EHEntryPreds.count(BB)) + ClonedEHEntryPreds.insert(BB); + + // Create a new basic block and copy instructions into it! + BasicBlock *CBB = CloneBasicBlock(BB, VMap, NameSuffix, NewFunc, CodeInfo, + SP ? &DIFinder : nullptr); + + // Add basic block mapping. + VMap[BB] = CBB; + + // It is only legal to clone a function if a block address within that + // function is never referenced outside of the function. 
Given that, we + // want to map block addresses from the old function to block addresses in + // the clone. (This is different from the generic ValueMapper + // implementation, which generates an invalid blockaddress when cloning a + // function.) + if (BB->hasAddressTaken()) { + Constant *OldBBAddr = BlockAddress::get(const_cast(OldFunc), + const_cast(BB)); + VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); + } + + // Note return instructions for the caller. + if (ReturnInst *RI = dyn_cast(CBB->getTerminator())) + Returns.push_back(RI); + } + } // end timed region + + // For each exit block, clean up its phi nodes to exclude predecessors that + // were not cloned. Also replace detached_rethrow invokes with resumes. + if (SharedEHEntries) { + NamedRegionTimer NRT("FixupSharedEH", "Fixup shared EH blocks", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + for (BasicBlock *EHEntry : *SharedEHEntries) { + if (!VMap[EHEntry]) + continue; + + // Get the predecessors of this exit block that were not cloned. + SmallVector PredNotCloned; + for (BasicBlock *Pred : predecessors(EHEntry)) + if (!ClonedEHEntryPreds.count(Pred)) + PredNotCloned.push_back(Pred); + + // Iterate over the phi nodes in the cloned exit block and remove incoming + // values from predecessors that were not cloned. + BasicBlock *ClonedEHEntry = cast(VMap[EHEntry]); + BasicBlock::iterator BI = ClonedEHEntry->begin(); + while (PHINode *PN = dyn_cast(BI)) { + for (BasicBlock *DeadPred : PredNotCloned) + if (PN->getBasicBlockIndex(DeadPred) > -1) + PN->removeIncomingValue(DeadPred); + ++BI; + } + } + } + if (ReattachBlocks) { + NamedRegionTimer NRT("FixupReattach", "Fixup reattach blocks", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + for (BasicBlock *ReattachBlk : *ReattachBlocks) { + BasicBlock *ClonedRB = cast(VMap[ReattachBlk]); + // Don't get the remapped name of this successor yet. Subsequent + // remapping will take correct the name. + BasicBlock *Succ = ClonedRB->getSingleSuccessor(); + ReplaceInstWithInst(ClonedRB->getTerminator(), + BranchInst::Create(Succ)); + } + } + if (TaskResumeBlocks) { + NamedRegionTimer NRT("FixupTaskResume", "Fixup task-resume blocks", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + for (BasicBlock *TaskResumeBlk : *TaskResumeBlocks) { + // Skip blocks that are not terminated by a detached.rethrow or + // taskframe.resume. + if (!isDetachedRethrow(TaskResumeBlk->getTerminator()) && + !isTaskFrameResume(TaskResumeBlk->getTerminator())) + continue; + + BasicBlock *ClonedDRB = cast(VMap[TaskResumeBlk]); + // If this exit block terminates in a detached.rethrow or + // taskframe.resume, replace the terminator with a resume. + InvokeInst *II = cast(ClonedDRB->getTerminator()); + Value *RethrowArg = II->getArgOperand(1); + ReplaceInstWithInst(ClonedDRB->getTerminator(), + ResumeInst::Create(RethrowArg)); + } + } + + { + NamedRegionTimer NRT("MapMetadata", "Map function metadata", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + for (DISubprogram *ISP : DIFinder.subprograms()) + if (ISP != SP) + VMap.MD()[ISP].reset(ISP); + + for (DICompileUnit *CU : DIFinder.compile_units()) + VMap.MD()[CU].reset(CU); + + for (DIType *Type : DIFinder.types()) + VMap.MD()[Type].reset(Type); + + // Duplicate the metadata that is attached to the cloned function. + // Subprograms/CUs/types that were already mapped to themselves won't be + // duplicated. 
+ SmallVector, 1> MDs; + OldFunc->getAllMetadata(MDs); + for (auto MD : MDs) { + NewFunc->addMetadata( + MD.first, + *MapMetadata(MD.second, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); + } + } // end timed region + + // Loop over all of the instructions in the function, fixing up operand + // references as we go. This uses VMap to do all the hard work. + { + NamedRegionTimer NRT("RemapBlock", "Remap instructions in block", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + for (const BasicBlock *BB : Blocks) { + BasicBlock *CBB = cast(VMap[BB]); + LLVM_DEBUG(dbgs() << "In block " << CBB->getName() << "\n"); + // Loop over all instructions, fixing each one as we find it... + for (Instruction &II : *CBB) { + LLVM_DEBUG(dbgs() << " Remapping " << II << "\n"); + RemapInstruction(&II, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer); + } + } + + // Remapping instructions could cause the Materializer to insert new + // instructions in the entry block. Now remap the instructions in the entry + // block. + if (Materializer) + while (!Materializer->BlocksToRemap.empty()) { + BasicBlock *BB = Materializer->BlocksToRemap.pop_back_val(); + for (Instruction &II : *BB) { + LLVM_DEBUG(dbgs() << " Remapping " << II << "\n"); + RemapInstruction(&II, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer); + } + } + } // end timed region + + // Register all DICompileUnits of the old parent module in the new parent + // module + auto *OldModule = OldFunc->getParent(); + auto *NewModule = NewFunc->getParent(); + if (OldModule && NewModule && OldModule != NewModule && + DIFinder.compile_unit_count()) { + auto *NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu"); + // Avoid multiple insertions of the same DICompileUnit to NMD. + SmallPtrSet Visited; + for (auto *Operand : NMD->operands()) + Visited.insert(Operand); + for (auto *Unit : DIFinder.compile_units()) + // VMap.MD()[Unit] == Unit + if (Visited.insert(Unit).second) + NMD->addOperand(Unit); + } +} + +/// Create a helper function whose signature is based on Inputs and +/// Outputs as follows: f(in0, ..., inN, out0, ..., outN) +/// +/// This logic is based on CloneFunctionInto, defined in +/// Transforms/Utils/CloneFunction, but with additional functionality specific +/// to Tapir outlining. 
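+///
+/// For illustration only (names hypothetical): outlining a task body that
+/// reads an i32 %n and a float* %a and produces one i32 output yields a
+/// helper along the lines of
+///
+///   define void @parent.outline_task.entry(i32 %n, float* %a, i32* %out0)
+///
+/// where each input is passed by value, each output becomes an extra pointer
+/// parameter, and the return type defaults to void unless ReturnType is
+/// supplied.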
+Function *llvm::CreateHelper( + const ValueSet &Inputs, const ValueSet &Outputs, + std::vector Blocks, BasicBlock *Header, + const BasicBlock *OldEntry, const BasicBlock *OldExit, + ValueToValueMapTy &VMap, Module *DestM, bool ModuleLevelChanges, + SmallVectorImpl &Returns, const StringRef NameSuffix, + SmallPtrSetImpl *ReattachBlocks, + SmallPtrSetImpl *DetachRethrowBlocks, + SmallPtrSetImpl *SharedEHEntries, + const BasicBlock *OldUnwind, + SmallPtrSetImpl *UnreachableExits, + Type *ReturnType, ClonedCodeInfo *CodeInfo, + ValueMapTypeRemapper *TypeMapper, OutlineMaterializer *Materializer) { + LLVM_DEBUG(dbgs() << "inputs: " << Inputs.size() << "\n"); + LLVM_DEBUG(dbgs() << "outputs: " << Outputs.size() << "\n"); + + Function *OldFunc = Header->getParent(); + Type *RetTy = ReturnType; + bool VoidRet = false; + if (!RetTy) + RetTy = Type::getVoidTy(Header->getContext()); + if (Type::getVoidTy(Header->getContext()) == RetTy) + VoidRet = true; + + std::vector paramTy; + + // Add the types of the input values to the function's argument list + for (Value *value : Inputs) { + LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n"); + paramTy.push_back(value->getType()); + } + + // Add the types of the output values to the function's argument list. + for (Value *output : Outputs) { + LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n"); + paramTy.push_back(PointerType::getUnqual(output->getType())); + } + + LLVM_DEBUG({ + dbgs() << "Function type: " << *RetTy << " f("; + for (Type *i : paramTy) + dbgs() << *i << ", "; + dbgs() << ")\n"; + }); + + FunctionType *FTy = FunctionType::get(RetTy, paramTy, false); + + // Create the new function + Function *NewFunc = Function::Create( + FTy, OldFunc->getLinkage(), + OldFunc->getName() + ".outline_" + Header->getName() + NameSuffix, DestM); + + // Set names for input and output arguments. At the same time, analyze + // notable arguments, such as vector arguments. + bool VectorArg = false; + uint64_t MaxVectorArgWidth = 0; + Function::arg_iterator DestI = NewFunc->arg_begin(); + for (Value *I : Inputs) { + if (VMap.count(I) == 0) { // Is this argument preserved? + DestI->setName(I->getName()+NameSuffix); // Copy the name over... + VMap[I] = &*DestI++; // Add mapping to VMap + } + // Check for any vector arguments, and record the maximum width of any + // vector argument we find. + if (VectorType *VT = dyn_cast(I->getType())) { + VectorArg = true; + ElementCount EC = VT->getElementCount(); + if (EC.isScalable()) + // If we have a scalable vector, give up. + MaxVectorArgWidth = std::numeric_limits::max(); + else { + unsigned VectorArgWidth = + EC.getKnownMinValue() * VT->getScalarSizeInBits(); + if (MaxVectorArgWidth < VectorArgWidth) + MaxVectorArgWidth = VectorArgWidth; + } + } + } + for (Value *I : Outputs) + if (VMap.count(I) == 0) { // Is this argument preserved? + DestI->setName(I->getName()+NameSuffix); // Copy the name over... + VMap[I] = &*DestI++; // Add mapping to VMap + } + + // Copy all attributes other than those stored in the AttributeSet. We need + // to remap the parameter indices of the AttributeSet. + AttributeList NewAttrs = NewFunc->getAttributes(); + NewFunc->copyAttributesFrom(OldFunc); + NewFunc->setAttributes(NewAttrs); + + // Fix up the personality function that got copied over. + if (OldFunc->hasPersonalityFn()) + NewFunc->setPersonalityFn( + MapValue(OldFunc->getPersonalityFn(), VMap, + ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); + + SmallVector NewArgAttrs(NewFunc->arg_size()); + AttributeList OldAttrs = OldFunc->getAttributes(); + + // Clone any argument attributes + { + NamedRegionTimer NRT("CloneArgAttrs", "Clone argument attributes", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + for (Argument &OldArg : OldFunc->args()) { + // Check if we're passing this argument to the helper. We check Inputs here + // instead of the VMap to avoid potentially populating the VMap with a null + // entry for the old argument. + if (Inputs.count(&OldArg) || Outputs.count(&OldArg)) { + Argument *NewArg = dyn_cast(VMap[&OldArg]); + NewArgAttrs[NewArg->getArgNo()] = + OldAttrs.getParamAttrs(OldArg.getArgNo()) + .removeAttribute(NewFunc->getContext(), Attribute::Returned); + } + } + } // end timed region + + NewFunc->setAttributes( + AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttrs(), + OldAttrs.getRetAttrs(), NewArgAttrs)); + + // Remove prologue data + if (NewFunc->hasPrologueData()) + NewFunc->setPrologueData(nullptr); + + // Remove old return attributes. + NewFunc->removeRetAttrs( + AttributeFuncs::typeIncompatible(NewFunc->getReturnType())); + + // Update vector-related attributes in the caller and new function + if (VectorArg && OldFunc->hasFnAttribute("min-legal-vector-width")) { + uint64_t CallerVectorWidth; + OldFunc->getFnAttribute("min-legal-vector-width") + .getValueAsString() + .getAsInteger(0, CallerVectorWidth); + if (std::numeric_limits::max() == MaxVectorArgWidth) { + // MaxVectorArgWidth is not a finite value. Give up and remove the + // min-legal-vector-width attribute, so OldFunc wil be treated + // conservatively henceforth. + OldFunc->removeFnAttr("min-legal-vector-width"); + // Update the min-legal-vector-width in the new function as well + NewFunc->removeFnAttr("min-legal-vector-width"); + } else if (MaxVectorArgWidth > CallerVectorWidth) { + // If MaxVectorArgWidth is a finite value and larger than the + // min-legal-vector-width of OldFunc, then set the + // min-legal-vector-width of OldFunc to match MaxVectorArgWidth. + OldFunc->addFnAttr("min-legal-vector-width", + llvm::utostr(MaxVectorArgWidth)); + // Update the min-legal-vector-width in the new function + NewFunc->addFnAttr("min-legal-vector-width", + llvm::utostr(MaxVectorArgWidth)); + } + } + + // Clone the metadata from the old function into the new. + bool MustCloneSP = OldFunc->getParent() && OldFunc->getParent() == DestM; + DISubprogram *SP = OldFunc->getSubprogram(); + if (SP) { + assert(!MustCloneSP || ModuleLevelChanges); + // Add mappings for some DebugInfo nodes that we don't want duplicated + // even if they're distinct. + auto &MD = VMap.MD(); + MD[SP->getUnit()].reset(SP->getUnit()); + MD[SP->getType()].reset(SP->getType()); + MD[SP->getFile()].reset(SP->getFile()); + // If we're not cloning into the same module, no need to clone the + // subprogram + if (!MustCloneSP) + MD[SP].reset(SP); + } + + // If the outlined function has pointer arguments its memory effects are + // unknown. Otherwise it inherits the memory effects of its parent. + // The caller can improve on this if desired. + for (Argument &Arg : NewFunc->args()) { + if (Arg.getType()->isPointerTy()) { + NewFunc->removeFnAttr(Attribute::Memory); + break; + } + } + + // Inherit the calling convention from the parent. 
+ NewFunc->setCallingConv(OldFunc->getCallingConv()); + + // The new function needs a root node because other nodes can branch to the + // head of the region, but the entry node of a function cannot have preds. + BasicBlock *NewEntry = BasicBlock::Create( + Header->getContext(), OldEntry->getName()+NameSuffix, NewFunc); + // The new function also needs an exit node. + BasicBlock *NewExit = BasicBlock::Create( + Header->getContext(), OldExit->getName()+NameSuffix); + + // Add mappings to the NewEntry and NewExit. + VMap[OldEntry] = NewEntry; + VMap[OldExit] = NewExit; + + BasicBlock *NewUnwind = nullptr; + // Create a new unwind destination for the cloned blocks if it's needed. + if (OldUnwind) { + NewUnwind = BasicBlock::Create( + NewFunc->getContext(), OldUnwind->getName()+NameSuffix); + VMap[OldUnwind] = NewUnwind; + } + + // Create an new unreachable exit block, if needed. + BasicBlock *NewUnreachable = nullptr; + if (UnreachableExits && !UnreachableExits->empty()) { + NewUnreachable = BasicBlock::Create( + NewFunc->getContext(), "unreachable"+NameSuffix); + new UnreachableInst(NewFunc->getContext(), NewUnreachable); + for (BasicBlock *Unreachable : *UnreachableExits) + VMap[Unreachable] = NewUnreachable; + } + + // Clone Blocks into the new function. + CloneIntoFunction(NewFunc, OldFunc, Blocks, VMap, ModuleLevelChanges, + Returns, NameSuffix, ReattachBlocks, DetachRethrowBlocks, + SharedEHEntries, SP, CodeInfo, TypeMapper, Materializer); + + // Add a branch in the new function to the cloned Header. + BasicBlock *ClonedHeader = cast(VMap[Header]); + BranchInst *EntryBr = BranchInst::Create(ClonedHeader, NewEntry); + // Set the debug location of the entry branch to match the first debug + // location in the cloned header. + for (const Instruction &I : *ClonedHeader) + if (const DebugLoc &Loc = I.getDebugLoc()) { + EntryBr->setDebugLoc(Loc); + break; + } + + // Insert the new exit block, terminated by a return. + NewExit->insertInto(NewFunc); + // Add a return in the new function, with a default null value if necessary. + ReturnInst *NewRet; + if (VoidRet) + NewRet = ReturnInst::Create(Header->getContext(), NewExit); + else + NewRet = ReturnInst::Create(Header->getContext(), + Constant::getNullValue(RetTy), NewExit); + // Set the debug location of the ret to match the debug location of some + // corresponding reattach. + for (const BasicBlock *Pred : predecessors(NewExit)) + if (const DebugLoc &Loc = Pred->getTerminator()->getDebugLoc()) { + NewRet->setDebugLoc(Loc); + break; + } + + // If needed, create a landingpad and resume for the unwind destination in the + // new function. + if (OldUnwind) { + NewUnwind->insertInto(NewFunc); + LandingPadInst *LPad = + LandingPadInst::Create(OldUnwind->getLandingPadInst()->getType(), 0, + "lpadval", NewUnwind); + LPad->setCleanup(true); + ResumeInst *NewResume = ResumeInst::Create(LPad, NewUnwind); + // Set the debug location of the resume to match the debug location of some + // corresponding detached_rethrow. + for (const BasicBlock *Pred : predecessors(NewUnwind)) + if (const DebugLoc &Loc = Pred->getTerminator()->getDebugLoc()) { + NewResume->setDebugLoc(Loc); + break; + } + } + + // If needed, add the new unreachable destination. + if (NewUnreachable) + NewUnreachable->insertInto(NewFunc); + + return NewFunc; +} + +// Add alignment assumptions to parameters of outlined function, based on known +// alignment data in the caller. 
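+//
+// For example (values illustrative): if the caller passes a pointer argument
+// that is known at the outlined call site to be 64-byte aligned (say, from an
+// aligned alloca or an llvm.assume), the corresponding helper parameter is
+// given an align(64) attribute so that optimizations inside the helper can
+// rely on that alignment.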
+void llvm::AddAlignmentAssumptions( + const Function *Caller, const ValueSet &Args, ValueToValueMapTy &VMap, + const Instruction *CallSite, AssumptionCache *AC, DominatorTree *DT) { + NamedRegionTimer NRT("AddAlignmentAssumptions", "Add alignment assumptions", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + auto &DL = Caller->getParent()->getDataLayout(); + for (Value *ArgVal : Args) { + // Ignore arguments to non-pointer types + if (!ArgVal->getType()->isPointerTy()) continue; + Argument *Arg = cast(VMap[ArgVal]); + // Ignore arguments to non-pointer types + if (!Arg->getType()->isPointerTy()) continue; + // If the argument already has an alignment attribute, skip it. + if (Arg->getParamAlign()) continue; + // Get any known alignment information for this argument's value. + Align Alignment = getKnownAlignment(ArgVal, DL, CallSite, AC, DT); + // If we have alignment data, add it as an attribute to the outlined + // function's parameter. + if (Alignment >= 1) + Arg->addAttr(Attribute::getWithAlignment(Arg->getContext(), Alignment)); + } +} diff --git a/llvm/lib/Transforms/Tapir/QthreadsABI.cpp b/llvm/lib/Transforms/Tapir/QthreadsABI.cpp new file mode 100644 index 000000000000000..3fc6c761b69a378 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/QthreadsABI.cpp @@ -0,0 +1,350 @@ +//===- QthreadsABI.cpp - Lower Tapir into Qthreads runtime system calls -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the QthreadsABI interface, which is used to convert +// Tapir instructions to calls into the Qthreads runtime system. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/QthreadsABI.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "qthreadsabi" + +static cl::opt UseCopyargs( + "qthreads-use-fork-copyargs", cl::init(false), cl::Hidden, + cl::desc("Use copyargs variant of fork")); + +// Accessors for opaque Qthreads RTS functions +FunctionCallee QthreadsABI::get_qthread_num_workers() { + if (QthreadNumWorkers) + return QthreadNumWorkers; + + LLVMContext &C = M.getContext(); + AttributeList AL; + // TODO: Set appropriate function attributes. + FunctionType *FTy = FunctionType::get(Type::getInt16Ty(C), {}, false); + QthreadNumWorkers = M.getOrInsertFunction("qthread_num_workers", FTy, AL); + return QthreadNumWorkers; +} + +FunctionCallee QthreadsABI::get_qthread_fork_copyargs() { + if (QthreadForkCopyargs) + return QthreadForkCopyargs; + + LLVMContext &C = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + AttributeList AL; + // TODO: Set appropriate function attributes. 
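+  // The function type built below is intended to mirror the Qthreads
+  // declaration (parameter names follow the Qthreads headers):
+  //
+  //   int qthread_fork_copyargs(qthread_f f, const void *arg,
+  //                             size_t arg_size, aligned_t *ret);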
+ FunctionType *FTy = FunctionType::get( + Type::getInt32Ty(C), + { QthreadFTy, // qthread_f f + Type::getInt8PtrTy(C), // const void *arg + DL.getIntPtrType(C), // size_t arg_size + Type::getInt64PtrTy(C) // aligned_t *ret + }, false); + + QthreadForkCopyargs = M.getOrInsertFunction("qthread_fork_copyargs", FTy, AL); + return QthreadForkCopyargs; +} + +FunctionCallee QthreadsABI::get_qthread_initialize() { + if (QthreadInitialize) + return QthreadInitialize; + + LLVMContext &C = M.getContext(); + AttributeList AL; + // TODO: Set appropriate function attributes. + FunctionType *FTy = FunctionType::get( + Type::getInt32Ty(C), {}, false); + + QthreadInitialize = M.getOrInsertFunction("qthread_initialize", FTy, AL); + return QthreadInitialize; +} + +FunctionCallee QthreadsABI::get_qt_sinc_create() { + if (QtSincCreate) + return QtSincCreate; + + LLVMContext &C = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + AttributeList AL; + // TODO: Set appropriate function attributes. + FunctionType *FTy = FunctionType::get( + Type::getInt8PtrTy(C), + { DL.getIntPtrType(C), // size_t size + Type::getInt8PtrTy(C), // void *initval + Type::getInt8PtrTy(C), // void *op + DL.getIntPtrType(C) // size_t expect + }, + false); + + QtSincCreate = M.getOrInsertFunction("qt_sinc_create", FTy, AL); + return QtSincCreate; +} + +FunctionCallee QthreadsABI::get_qt_sinc_expect() { + if (QtSincExpect) + return QtSincExpect; + + LLVMContext &C = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + AttributeList AL; + // TODO: Set appropriate function attributes. + FunctionType *FTy = FunctionType::get( + Type::getVoidTy(C), + { Type::getInt8PtrTy(C), // sync_t *s + DL.getIntPtrType(C) // size_t incr + }, + false); + + QtSincExpect = M.getOrInsertFunction("qt_sinc_expect", FTy, AL); + return QtSincExpect; +} + +FunctionCallee QthreadsABI::get_qt_sinc_submit() { + if (QtSincSubmit) + return QtSincSubmit; + + LLVMContext &C = M.getContext(); + AttributeList AL; + // TODO: Set appropriate function attributes. + FunctionType *FTy = FunctionType::get( + Type::getVoidTy(C), + { Type::getInt8PtrTy(C), // sync_t *s + Type::getInt8PtrTy(C) // void *val + }, + false); + + QtSincSubmit = M.getOrInsertFunction("qt_sinc_submit", FTy, AL); + return QtSincSubmit; +} + +FunctionCallee QthreadsABI::get_qt_sinc_wait() { + if (QtSincWait) + return QtSincWait; + + LLVMContext &C = M.getContext(); + AttributeList AL; + // TODO: Set appropriate function attributes. + FunctionType *FTy = FunctionType::get( + Type::getVoidTy(C), + { Type::getInt8PtrTy(C), // sync_t *s + Type::getInt8PtrTy(C) // void *target + }, + false); + + QtSincWait = M.getOrInsertFunction("qt_sinc_wait", FTy, AL); + return QtSincWait; +} + +FunctionCallee QthreadsABI::get_qt_sinc_destroy() { + if (QtSincDestroy) + return QtSincDestroy; + + LLVMContext &C = M.getContext(); + AttributeList AL; + // TODO: Set appropriate function attributes. + FunctionType *FTy = FunctionType::get( + Type::getVoidTy(C), + { Type::getInt8PtrTy(C), // sync_t *s + }, + false); + + QtSincDestroy = M.getOrInsertFunction("qt_sinc_destroy", FTy, AL); + return QtSincDestroy; +} + +#define QTHREAD_FUNC(name) get_##name() + +QthreadsABI::QthreadsABI(Module &M) : TapirTarget(M) { + LLVMContext &C = M.getContext(); + // Initialize any types we need for lowering. + QthreadFTy = PointerType::getUnqual( + FunctionType::get(Type::getInt64Ty(C), { Type::getInt8PtrTy(C) }, false)); +} + +/// Lower a call to get the grainsize of this Tapir loop. 
+/// +/// The grainsize is computed by the following equation: +/// +/// Grainsize = min(2048, ceil(Limit / (8 * workers))) +/// +/// This computation is inserted into the preheader of the loop. +Value *QthreadsABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { + Value *Limit = GrainsizeCall->getArgOperand(0); + IRBuilder<> Builder(GrainsizeCall); + + // Get 8 * workers + Value *Workers = Builder.CreateCall(QTHREAD_FUNC(qthread_num_workers)); + Value *WorkersX8 = Builder.CreateIntCast( + Builder.CreateMul(Workers, ConstantInt::get(Workers->getType(), 8)), + Limit->getType(), false); + // Compute ceil(limit / 8 * workers) = + // (limit + 8 * workers - 1) / (8 * workers) + Value *SmallLoopVal = + Builder.CreateUDiv(Builder.CreateSub(Builder.CreateAdd(Limit, WorkersX8), + ConstantInt::get(Limit->getType(), 1)), + WorkersX8); + // Compute min + Value *LargeLoopVal = ConstantInt::get(Limit->getType(), 2048); + Value *Cmp = Builder.CreateICmpULT(LargeLoopVal, SmallLoopVal); + Value *Grainsize = Builder.CreateSelect(Cmp, LargeLoopVal, SmallLoopVal); + + // Replace uses of grainsize intrinsic call with this grainsize value. + GrainsizeCall->replaceAllUsesWith(Grainsize); + return Grainsize; +} + +Value *QthreadsABI::getOrCreateSinc(Value *SyncRegion, Function *F) { + LLVMContext &C = M.getContext(); + Value* sinc; + if((sinc = SyncRegionToSinc[SyncRegion])) + return sinc; + else { + Value* zero = ConstantInt::get(Type::getInt64Ty(C), 0); + Value* null = Constant::getNullValue(Type::getInt8PtrTy(C)); + std::vector createArgs = {zero, null, null, zero}; + sinc = CallInst::Create(QTHREAD_FUNC(qt_sinc_create), createArgs, "", + F->getEntryBlock().getTerminator()); + SyncRegionToSinc[SyncRegion] = sinc; + + // Make sure we destroy the sinc at all exit points to prevent memory leaks + for(BasicBlock &BB : *F) { + if(isa(BB.getTerminator())){ + CallInst::Create(QTHREAD_FUNC(qt_sinc_destroy), {sinc}, "", + BB.getTerminator()); + } + } + + return sinc; + } +} + +void QthreadsABI::lowerSync(SyncInst &SI) { + IRBuilder<> builder(&SI); + auto F = SI.getParent()->getParent(); + auto& C = M.getContext(); + auto null = Constant::getNullValue(Type::getInt8PtrTy(C)); + Value* SR = SI.getSyncRegion(); + auto sinc = getOrCreateSinc(SR, F); + std::vector args = {sinc, null}; + auto sincwait = QTHREAD_FUNC(qt_sinc_wait); + builder.CreateCall(sincwait, args); + BranchInst *PostSync = BranchInst::Create(SI.getSuccessor(0)); + ReplaceInstWithInst(&SI, PostSync); +} + +void QthreadsABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) { + Function *Outlined = TOI.Outline; + Instruction *ReplStart = TOI.ReplStart; + CallBase *ReplCall = cast(TOI.ReplCall); + BasicBlock *CallBlock = ReplStart->getParent(); + + LLVMContext &C = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + + // At this point, we have a call in the parent to a function containing the + // task body. That function takes as its argument a pointer to a structure + // containing the inputs to the task body. This structure is initialized in + // the parent immediately before the call. + + // To match the Qthreads ABI, we replace the existing call with a call to + // qthreads_fork_copyargs. 
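+  //
+  // Roughly (IR sketch, names illustrative), a call such as
+  //   call void @parent.outline_task(%args.struct* %argstruct)
+  // is replaced with
+  //   call i32 @qthread_fork_copyargs(qthread_f %outlined.cast,
+  //                                   i8* %argstruct.cast,
+  //                                   i64 sizeof(%args.struct),
+  //                                   aligned_t* null)
+  // Because the copyargs variant copies the argument struct, the struct's
+  // lifetime in the parent can end right after the fork (see the lifetime
+  // intrinsics added below).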
+ IRBuilder<> CallerIRBuilder(ReplCall); + Value *OutlinedFnPtr = CallerIRBuilder.CreatePointerBitCastOrAddrSpaceCast( + Outlined, QthreadFTy); + AllocaInst *CallerArgStruct = cast(ReplCall->getArgOperand(0)); + Type *ArgsTy = CallerArgStruct->getAllocatedType(); + Value *ArgStructPtr = CallerIRBuilder.CreateBitCast(CallerArgStruct, + Type::getInt8PtrTy(C)); + Constant *Null = Constant::getNullValue(Type::getInt64PtrTy(C)); + ConstantInt *ArgSize = ConstantInt::get(DL.getIntPtrType(C), + DL.getTypeAllocSize(ArgsTy)); + CallInst *Call = CallerIRBuilder.CreateCall( + QTHREAD_FUNC(qthread_fork_copyargs), { OutlinedFnPtr, ArgStructPtr, + ArgSize, Null }); + Call->setDebugLoc(ReplCall->getDebugLoc()); + TOI.replaceReplCall(Call); + ReplCall->eraseFromParent(); + + // Add lifetime intrinsics for the argument struct. TODO: Move this logic + // into underlying LoweringUtils routines? + CallerIRBuilder.SetInsertPoint(ReplStart); + CallerIRBuilder.CreateLifetimeStart(CallerArgStruct, ArgSize); + CallerIRBuilder.SetInsertPoint(CallBlock, ++Call->getIterator()); + CallerIRBuilder.CreateLifetimeEnd(CallerArgStruct, ArgSize); + + if (TOI.ReplUnwind) + // We assume that qthread_fork_copyargs dealt with the exception. But + // replacing the invocation of the helper function with the call to + // qthread_fork_copyargs will remove the terminator from CallBlock. Restore + // that terminator here. + BranchInst::Create(TOI.ReplRet, CallBlock); + + // VERIFY: If we're using fork_copyargs, we don't need a separate helper + // function to manage the allocation of the argument structure. +} + +bool QthreadsABI::preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) { + if (ProcessingTapirLoops) + // Don't do any preprocessing when outlining Tapir loops. + return false; + + LLVMContext &C = M.getContext(); + for (Task *T : post_order(TI.getRootTask())) { + if (T->isRootTask()) + continue; + DetachInst *Detach = T->getDetach(); + BasicBlock *detB = Detach->getParent(); + BasicBlock *Spawned = T->getEntry(); + Value *SR = Detach->getSyncRegion(); + Value *sinc = getOrCreateSinc(SR, &F); + + // Add an expect increment before spawning + IRBuilder<> preSpawnB(detB); + Value* one = ConstantInt::get(Type::getInt64Ty(C), 1); + std::vector expectArgs = {sinc, one}; + CallInst::Create(QTHREAD_FUNC(qt_sinc_expect), expectArgs, "", Detach); + + // Add a submit to end of task body + // + // TB: I would interpret the above comment to mean we want qt_sinc_submit() + // before the task terminates. But the code I see for inserting + // qt_sinc_submit just inserts the call at the end of the entry block of the + // task, which is not necessarily the end of the task. I kept the code I + // found, but I'm not sure if it is correct. + IRBuilder<> footerB(Spawned->getTerminator()); + Value* null = Constant::getNullValue(Type::getInt8PtrTy(C)); + std::vector submitArgs = {sinc, null}; + footerB.CreateCall(QTHREAD_FUNC(qt_sinc_submit), submitArgs); + } + return false; +} + +void QthreadsABI::postProcessFunction(Function &F, bool ProcessingTapirLoops) { + if (ProcessingTapirLoops) + // Don't do any preprocessing when outlining Tapir loops. 
+ return; + + CallInst::Create(QTHREAD_FUNC(qthread_initialize), "", + F.getEntryBlock().getFirstNonPHIOrDbg()); +} + +void QthreadsABI::postProcessHelper(Function &F) {} + diff --git a/llvm/lib/Transforms/Tapir/SerialABI.cpp b/llvm/lib/Transforms/Tapir/SerialABI.cpp new file mode 100644 index 000000000000000..dc23f3b1e9c1778 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/SerialABI.cpp @@ -0,0 +1,52 @@ +//===- SerialABI.cpp - Replace Tapir with serial projection ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the SerialABI interface, which is used to convert Tapir +// instructions into their serial projection. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/SerialABI.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "serialabi" + +Value *SerialABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { + Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 1); + + // Replace uses of grainsize intrinsic call with this grainsize value. + GrainsizeCall->replaceAllUsesWith(Grainsize); + return Grainsize; +} + +void SerialABI::lowerSync(SyncInst &SI) { + ReplaceInstWithInst(&SI, BranchInst::Create(SI.getSuccessor(0))); +} + +bool SerialABI::preProcessFunction(Function &F, TaskInfo &TI, + bool ProcessingTapirLoops) { + if (ProcessingTapirLoops) + // Don't do any preprocessing when outlining Tapir loops. + return false; + + bool Changed = false; + for (Task *T : post_order(TI.getRootTask())) { + if (T->isRootTask()) + continue; + DetachInst *DI = T->getDetach(); + SerializeDetach(DI, T); + Changed = true; + } + return Changed; +} + + diff --git a/llvm/lib/Transforms/Tapir/SerializeSmallTasks.cpp b/llvm/lib/Transforms/Tapir/SerializeSmallTasks.cpp new file mode 100644 index 000000000000000..52c866c30554549 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/SerializeSmallTasks.cpp @@ -0,0 +1,216 @@ +//===- SerializeSmallTasks.cpp - Serialize small Tapir tasks --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass serializes Tapir tasks with too little work to justify spawning. 
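+// For example, a Tapir loop whose constant trip count does not exceed the
+// stripmine count computed for it is assumed not to contain enough work to
+// pay for a spawn, so its detach is replaced by its serial projection.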
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/SerializeSmallTasks.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/WorkSpanAnalysis.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Tapir/LoopStripMine.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "serialize-small-tasks" + +static cl::opt SerializeUnprofitableLoops( + "serialize-unprofitable-loops", cl::Hidden, cl::init(true), + cl::desc("Serialize any Tapir tasks found to be unprofitable.")); + +static bool trySerializeSmallLoop( + Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + const TargetTransformInfo &TTI, AssumptionCache &AC, TaskInfo *TI, + OptimizationRemarkEmitter &ORE, TargetLibraryInfo *TLI) { + bool Changed = false; + for (Loop *SubL : *L) + Changed |= trySerializeSmallLoop(SubL, DT, LI, SE, TTI, AC, TI, ORE, TLI); + + Task *T = getTaskIfTapirLoopStructure(L, TI); + if (!T) + return Changed; + + // Skip any loop for which stripmining is explicitly disabled. + if (TM_Disable == hasLoopStripmineTransformation(L)) + return Changed; + + TapirLoopHints Hints(L); + + TargetTransformInfo::StripMiningPreferences SMP = + gatherStripMiningPreferences(L, SE, TTI, std::nullopt); + + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(L, &AC, EphValues); + + WSCost LoopCost; + estimateLoopCost(LoopCost, L, LI, &SE, TTI, TLI, EphValues); + + // If the work in the loop is larger than the maximum value we can deal with, + // then it's not small. + if (LoopCost.UnknownCost) + return Changed; + + computeStripMineCount(L, TTI, LoopCost.Work, SMP); + // Make sure the count is a power of 2. + if (!isPowerOf2_32(SMP.Count)) + SMP.Count = NextPowerOf2(SMP.Count); + + // Find a constant trip count if available + unsigned ConstTripCount = getConstTripCount(L, SE); + + if (!ConstTripCount || SMP.Count < ConstTripCount) + return Changed; + + // Serialize the loop's detach, since it appears to be too small to be worth + // parallelizing. 
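+  //
+  // Worked example (numbers illustrative): with a constant trip count of 4
+  // and a computed stripmine count of 2048, control reaches this point and
+  // the loop is serialized; with a constant trip count of 100000, the early
+  // return above fires and the loop is left parallel.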
+ ORE.emit([&]() { + return OptimizationRemark("serialize-small-tasks", + "SerializingSmallLoop", + L->getStartLoc(), L->getHeader()) + << "Serializing parallel loop that appears to be unprofitable " + << "to parallelize."; + }); + SerializeDetach(cast(L->getHeader()->getTerminator()), T, + /* ReplaceWithTaskFrame = */ taskContainsSync(T), &DT); + Hints.clearHintsMetadata(); + L->setDerivedFromTapirLoop(); + return true; +} + +namespace { +struct SerializeSmallTasks : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + SerializeSmallTasks() : FunctionPass(ID) { + initializeSerializeSmallTasksPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + } +}; +} + +char SerializeSmallTasks::ID = 0; +INITIALIZE_PASS_BEGIN(SerializeSmallTasks, "serialize-small-tasks", + "Serialize small Tapir tasks", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) +INITIALIZE_PASS_END(SerializeSmallTasks, "serialize-small-tasks", + "Serialize small Tapir tasks", false, false) + +namespace llvm { +FunctionPass *createSerializeSmallTasksPass() { + return new SerializeSmallTasks(); +} +} // end namespace llvm + +/// runOnFunction - Run through all tasks in the function and simplify them in +/// post order. +/// +bool SerializeSmallTasks::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + TaskInfo &TI = getAnalysis().getTaskInfo(); + if (TI.isSerial()) + return false; + + auto &TLI = getAnalysis().getTLI(F); + auto &DT = getAnalysis().getDomTree(); + LoopInfo *LI = &getAnalysis().getLoopInfo(); + ScalarEvolution &SE = getAnalysis().getSE(); + const TargetTransformInfo &TTI = + getAnalysis().getTTI(F); + auto &AC = getAnalysis().getAssumptionCache(F); + // For the old PM, we can't use OptimizationRemarkEmitter as an analysis + // pass. Function analyses need to be preserved across loop transformations + // but ORE cannot be preserved (see comment before the pass definition). 
+ OptimizationRemarkEmitter ORE(&F); + + LLVM_DEBUG(dbgs() << "SerializeSmallTasks running on function " << F.getName() + << "\n"); + + bool Changed = false; + if (SerializeUnprofitableLoops) + for (Loop *L : *LI) + Changed |= trySerializeSmallLoop(L, DT, LI, SE, TTI, AC, &TI, ORE, &TLI); + + if (Changed) + // Recalculate TaskInfo + TI.recalculate(*DT.getRoot()->getParent(), DT); + + return Changed; +} + +PreservedAnalyses SerializeSmallTasksPass::run(Function &F, + FunctionAnalysisManager &AM) { + if (F.empty()) + return PreservedAnalyses::all(); + + TaskInfo &TI = AM.getResult(F); + if (TI.isSerial()) + return PreservedAnalyses::all(); + + auto &TLI = AM.getResult(F); + auto &SE = AM.getResult(F); + auto &LI = AM.getResult(F); + auto &TTI = AM.getResult(F); + auto &DT = AM.getResult(F); + auto &AC = AM.getResult(F); + auto &ORE = AM.getResult(F); + + + LLVM_DEBUG(dbgs() << "SerializeSmallTasks running on function " << F.getName() + << "\n"); + + bool Changed = false; + if (SerializeUnprofitableLoops) + for (Loop *L : LI) + Changed |= trySerializeSmallLoop(L, DT, &LI, SE, TTI, AC, &TI, ORE, &TLI); + + if (!Changed) + return PreservedAnalyses::all(); + + // Recalculate TaskInfo + TI.recalculate(*DT.getRoot()->getParent(), DT); + + PreservedAnalyses PA; + PA.preserve(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + // TODO: Add more preserved analyses here. + return PA; +} diff --git a/llvm/lib/Transforms/Tapir/Tapir.cpp b/llvm/lib/Transforms/Tapir/Tapir.cpp new file mode 100644 index 000000000000000..bc4e056fc9aba70 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/Tapir.cpp @@ -0,0 +1,35 @@ +//===- Tapir.cpp ----------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements common infrastructure for libLLVMTapirOpts.a, which +// implements several transformations over the Tapir/LLVM intermediate +// representation, including the C bindings for that library. +// +//===----------------------------------------------------------------------===// + +#include "llvm-c/Transforms/Tapir.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Transforms/Tapir.h" + +using namespace llvm; + +/// initializeTapirOpts - Initialize all passes linked into the +/// TapirOpts library. +void llvm::initializeTapirOpts(PassRegistry &Registry) { + initializeLoopSpawningTIPass(Registry); + initializeLowerTapirToTargetPass(Registry); + initializeTaskCanonicalizePass(Registry); + initializeTaskSimplifyPass(Registry); + initializeDRFScopedNoAliasWrapperPassPass(Registry); + initializeLoopStripMinePass(Registry); + initializeSerializeSmallTasksPass(Registry); +} diff --git a/llvm/lib/Transforms/Tapir/TapirLoopInfo.cpp b/llvm/lib/Transforms/Tapir/TapirLoopInfo.cpp new file mode 100644 index 000000000000000..1846e2a782ad7ad --- /dev/null +++ b/llvm/lib/Transforms/Tapir/TapirLoopInfo.cpp @@ -0,0 +1,646 @@ +//===- TapirLoopInfo.cpp - Utility functions for Tapir loops --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utility functions for handling Tapir loops. +// +// Many of these routines are adapted from +// Transforms/Vectorize/LoopVectorize.cpp. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/TapirLoopInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Transforms/Tapir/LoweringUtils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "tapir" + +/// Create an analysis remark that explains why the transformation failed +/// +/// \p RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents the transformation. Otherwise \p TheLoop is used +/// for the location of the remark. \return the remark object that can be +/// streamed to. +/// +/// Based on createMissedAnalysis in the LoopVectorize pass. +OptimizationRemarkAnalysis +TapirLoopInfo::createMissedAnalysis(const char *PassName, StringRef RemarkName, + const Loop *TheLoop, Instruction *I) { + const Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert back to + // using the loop's. + if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "Tapir loop not transformed: "; + return R; +} + +/// Update information on this Tapir loop based on its metadata. +void TapirLoopInfo::readTapirLoopMetadata(OptimizationRemarkEmitter &ORE) { + TapirLoopHints Hints(getLoop()); + + // Get a grainsize for this Tapir loop from the metadata, if the metadata + // gives a grainsize. + Grainsize = Hints.getGrainsize(); +} + +static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { + if (Ty->isPointerTy()) + return DL.getIntPtrType(Ty); + + // It is possible that char's or short's overflow when we ask for the loop's + // trip count, work around this by changing the type size. + if (Ty->getScalarSizeInBits() < 32) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; +} + +static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { + Ty0 = convertPointerToIntegerType(DL, Ty0); + Ty1 = convertPointerToIntegerType(DL, Ty1); + if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) + return Ty0; + return Ty1; +} + +/// Adds \p Phi, with induction descriptor ID, to the inductions list. This can +/// set \p Phi as the main induction of the loop if \p Phi is a better choice +/// for the main induction than the existing one. +void TapirLoopInfo::addInductionPhi(PHINode *Phi, + const InductionDescriptor &ID) { + Inductions[Phi] = ID; + + Type *PhiTy = Phi->getType(); + const DataLayout &DL = Phi->getModule()->getDataLayout(); + + // Int inductions are special because we only allow one IV. + if (ID.getKind() == InductionDescriptor::IK_IntInduction && + ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() && + isa(ID.getStartValue()) && + cast(ID.getStartValue())->isNullValue()) { + + // Get the widest type. 
+ if (!WidestIndTy) + WidestIndTy = convertPointerToIntegerType(DL, PhiTy); + else + WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + + // Use the phi node with the widest type as induction. Use the last + // one if there are multiple (no good reason for doing this other + // than it is expedient). We've checked that it begins at zero and + // steps by one, so this is a canonical induction variable. + if (!PrimaryInduction || PhiTy == WidestIndTy) + PrimaryInduction = Phi; + } + + // // Both the PHI node itself, and the "post-increment" value feeding + // // back into the PHI node may have external users. + // // We can allow those uses, except if the SCEVs we have for them rely + // // on predicates that only hold within the loop, since allowing the exit + // // currently means re-using this SCEV outside the loop. + // if (PSE.getUnionPredicate().isAlwaysTrue()) { + // AllowedExit.insert(Phi); + // AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); + // } + + LLVM_DEBUG(dbgs() << "TapirLoop: Found an induction variable: " << *Phi + << "\n"); +} + +/// Gather all induction variables in this loop that need special handling +/// during outlining. +bool TapirLoopInfo::collectIVs(PredicatedScalarEvolution &PSE, + const char *PassName, + OptimizationRemarkEmitter *ORE) { + Loop *L = getLoop(); + for (Instruction &I : *L->getHeader()) { + if (auto *Phi = dyn_cast(&I)) { + Type *PhiTy = Phi->getType(); + // Check that this PHI type is allowed. + if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) { + if (ORE) + ORE->emit(createMissedAnalysis(PassName, "CFGNotUnderstood", L, Phi) + << "loop control flow is not understood by loop spawning"); + LLVM_DEBUG(dbgs() << "TapirLoop: Found an non-int non-pointer PHI.\n"); + return false; + } + + // We only allow if-converted PHIs with exactly two incoming values. + if (Phi->getNumIncomingValues() != 2) { + if (ORE) + ORE->emit(createMissedAnalysis(PassName, "CFGNotUnderstood", L, Phi) + << "loop control flow is not understood by loop spawning"); + LLVM_DEBUG(dbgs() << "TapirLoop: Found an invalid PHI.\n"); + return false; + } + + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, L, PSE, ID)) { + LLVM_DEBUG(dbgs() << "\tFound induction PHI " << *Phi << "\n"); + addInductionPhi(Phi, ID); + // if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr) + // Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); + continue; + } + + // As a last resort, coerce the PHI to a AddRec expression and re-try + // classifying it a an induction PHI. + if (InductionDescriptor::isInductionPHI(Phi, L, PSE, ID, true)) { + LLVM_DEBUG(dbgs() << "\tCoerced induction PHI " << *Phi << "\n"); + addInductionPhi(Phi, ID); + continue; + } + + LLVM_DEBUG(dbgs() << "\tPassed PHI " << *Phi << "\n"); + } // end of PHI handling + } + + if (!PrimaryInduction) { + LLVM_DEBUG(dbgs() + << "TapirLoop: Did not find a primary integer induction var.\n"); + if (ORE) + ORE->emit(createMissedAnalysis(PassName, "NoInductionVariable", L) + << "canonical loop induction variable could not be identified"); + if (Inductions.empty()) + return false; + } + + // Now we know the widest induction type, check if our found induction is the + // same size. + // + // TODO: Check if this code is dead due to IndVarSimplify. 
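+  //
+  // For example, if the candidate primary induction is an i32 phi but the
+  // loop also has an i64 induction, WidestIndTy is i64; the i32 phi is
+  // discarded here, and prepareForOutlining will then bail out because no
+  // primary induction remains.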
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) + PrimaryInduction = nullptr; + + return true; +} + +/// Replace all induction variables in this loop that are not primary with +/// stronger forms. +void TapirLoopInfo::replaceNonPrimaryIVs(PredicatedScalarEvolution &PSE) { + BasicBlock *Header = getLoop()->getHeader(); + IRBuilder<> B(&*Header->getFirstInsertionPt()); + const DataLayout &DL = Header->getModule()->getDataLayout(); + SmallVector, 4> InductionsToRemove; + + // Replace all non-primary inductions with strengthened forms. + for (auto &InductionEntry : Inductions) { + PHINode *OrigPhi = InductionEntry.first; + InductionDescriptor II = InductionEntry.second; + if (OrigPhi == PrimaryInduction) continue; + LLVM_DEBUG(dbgs() << "Replacing Phi " << *OrigPhi << "\n"); + // If Induction is not canonical, replace it with some computation based on + // PrimaryInduction. + Type *StepType = II.getStep()->getType(); + Instruction::CastOps CastOp = + CastInst::getCastOpcode(PrimaryInduction, true, StepType, true); + Value *CRD = B.CreateCast(CastOp, PrimaryInduction, StepType, "cast.crd"); + Value *PhiRepl = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + PhiRepl->setName(OrigPhi->getName() + ".tl.repl"); + OrigPhi->replaceAllUsesWith(PhiRepl); + InductionsToRemove.push_back(InductionEntry); + } + + // Remove all inductions that were replaced from Inductions. + for (auto &InductionEntry : InductionsToRemove) { + PHINode *OrigPhi = InductionEntry.first; + OrigPhi->eraseFromParent(); + Inductions.erase(OrigPhi); + } +} + +bool TapirLoopInfo::getLoopCondition(const char *PassName, + OptimizationRemarkEmitter *ORE) { + Loop *L = getLoop(); + + // Check that the latch is terminated by a branch instruction. The + // LoopRotate pass can be helpful to ensure this property. + BranchInst *BI = + dyn_cast(L->getLoopLatch()->getTerminator()); + if (!BI || BI->isUnconditional()) { + LLVM_DEBUG(dbgs() + << "Loop-latch terminator is not a conditional branch.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(PassName, "NoLatchBranch", + L) + << "loop latch is not terminated by a conditional branch"); + return false; + } + // Check that the condition is an integer-equality comparison. The + // IndVarSimplify pass should transform Tapir loops to use integer-equality + // comparisons when the loop can be analyzed. + { + const ICmpInst *Cond = dyn_cast(BI->getCondition()); + if (!Cond) { + LLVM_DEBUG(dbgs() << + "Loop-latch condition is not an integer comparison.\n"); + if (ORE) + ORE->emit(TapirLoopInfo::createMissedAnalysis(PassName, "NotIntCmp", L) + << "loop-latch condition is not an integer comparison"); + return false; + } + if (!Cond->isEquality()) { + LLVM_DEBUG(dbgs() << + "Loop-latch condition is not an equality comparison.\n"); + // TODO: Find a reasonable analysis message to give to users. + // if (ORE) + // ORE->emit(TapirLoopInfo::createMissedAnalysis(PassName, + // "NonCanonicalCmp", L) + // << "non-canonical loop-latch condition"); + return false; + } + } + Condition = dyn_cast(BI->getCondition()); + LLVM_DEBUG(dbgs() << "\tLoop condition " << *Condition << "\n"); + + if (Condition->getOperand(0) == PrimaryInduction || + Condition->getOperand(1) == PrimaryInduction) { + // The condition examines the primary induction before the increment. Check + // to see if the condition directs control to exit the loop once + // PrimaryInduction equals the end value. 
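+    //
+    // IR sketch (illustrative): a latch of the form
+    //   %cond = icmp eq i64 %i, %end
+    //   br i1 %cond, label %exit, label %header
+    // tests the pre-increment value %i, so the body has already run for the
+    // iteration in which %i == %end; the end value is therefore part of the
+    // iteration range, and InclusiveRange is set below.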
+ if ((ICmpInst::ICMP_EQ == Condition->getPredicate() && + BI->getSuccessor(1) == L->getHeader()) || + (ICmpInst::ICMP_NE == Condition->getPredicate() && + BI->getSuccessor(0) == L->getHeader())) + // The end iteration is included in the loop bounds. + InclusiveRange = true; + } + + return true; +} + +static Value *getEscapeValue(Instruction *UI, const InductionDescriptor &II, + Value *TripCount, PredicatedScalarEvolution &PSE, + bool PostInc) { + const DataLayout &DL = UI->getModule()->getDataLayout(); + IRBuilder<> B(&*UI->getParent()->getFirstInsertionPt()); + Value *EffTripCount = TripCount; + if (!PostInc) + EffTripCount = B.CreateSub( + TripCount, ConstantInt::get(TripCount->getType(), 1)); + + Value *Count = !II.getStep()->getType()->isIntegerTy() + ? B.CreateCast(Instruction::SIToFP, EffTripCount, + II.getStep()->getType()) + : B.CreateSExtOrTrunc(EffTripCount, II.getStep()->getType()); + if (PostInc) + Count->setName("cast.count"); + else + Count->setName("cast.cmo"); + + Value *Escape = emitTransformedIndex(B, Count, PSE.getSE(), DL, II); + Escape->setName(UI->getName() + ".escape"); + return Escape; +} + +/// Fix up external users of the induction variable. We assume we are in LCSSA +/// form, with all external PHIs that use the IV having one input value, coming +/// from the remainder loop. We need those PHIs to also have a correct value +/// for the IV when arriving directly from the middle block. +void TapirLoopInfo::fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, + PredicatedScalarEvolution &PSE) { + // There are two kinds of external IV usages - those that use the value + // computed in the last iteration (the PHI) and those that use the penultimate + // value (the value that feeds into the phi from the loop latch). + // We allow both, but they, obviously, have different values. + assert(getExitBlock() && "Expected a single exit block"); + assert(getTripCount() && "Expected valid trip count"); + Loop *L = getLoop(); + Task *T = getTask(); + Value *TripCount = getTripCount(); + + DenseMap MissingVals; + + // An external user of the last iteration's value should see the value that + // the remainder loop uses to initialize its own IV. + Value *PostInc = OrigPhi->getIncomingValueForBlock(L->getLoopLatch()); + for (User *U : PostInc->users()) { + Instruction *UI = cast(U); + if (!L->contains(UI) && !T->encloses(UI->getParent())) { + assert(isa(UI) && "Expected LCSSA form"); + MissingVals[UI] = getEscapeValue(UI, II, TripCount, PSE, true); + } + } + + // An external user of the penultimate value needs to see TripCount - Step. + // The simplest way to get this is to recompute it from the constituent SCEVs, + // that is Start + (Step * (TripCount - 1)). 
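+  //
+  // Worked example (values illustrative): for an induction that starts at 0
+  // and steps by 1 with TripCount = 10, an LCSSA phi that uses the latch
+  // (post-increment) value is rewritten to 0 + 1 * 10 = 10, while a phi that
+  // uses OrigPhi itself is rewritten to 0 + 1 * (10 - 1) = 9.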
+ for (User *U : OrigPhi->users()) { + Instruction *UI = cast(U); + if (!L->contains(UI) && !T->encloses(UI->getParent())) { + assert(isa(UI) && "Expected LCSSA form"); + MissingVals[UI] = getEscapeValue(UI, II, TripCount, PSE, false); + } + } + + for (auto &I : MissingVals) { + LLVM_DEBUG(dbgs() << "Replacing external IV use:" << *I.first << " with " + << *I.second << "\n"); + PHINode *PHI = cast(I.first); + PHI->replaceAllUsesWith(I.second); + PHI->eraseFromParent(); + } +} + +const SCEV *TapirLoopInfo::getBackedgeTakenCount( + PredicatedScalarEvolution &PSE) const { + Loop *L = getLoop(); + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + if (BackedgeTakenCount == SE->getCouldNotCompute()) + BackedgeTakenCount = SE->getExitCount(L, L->getLoopLatch()); + + if (BackedgeTakenCount == SE->getCouldNotCompute()) + return BackedgeTakenCount; + + Type *IdxTy = getWidestInductionType(); + + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > + IdxTy->getPrimitiveSizeInBits()) + BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); + BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); + + return BackedgeTakenCount; +} + +const SCEV *TapirLoopInfo::getExitCount(const SCEV *BackedgeTakenCount, + PredicatedScalarEvolution &PSE) const { + ScalarEvolution *SE = PSE.getSE(); + const SCEV *ExitCount; + if (InclusiveRange) + ExitCount = BackedgeTakenCount; + else + // Get the total trip count from the count by adding 1. + ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + return ExitCount; +} + +/// Returns (and creates if needed) the original loop trip count. +Value *TapirLoopInfo::getOrCreateTripCount(PredicatedScalarEvolution &PSE, + const char *PassName, + OptimizationRemarkEmitter *ORE) { + if (TripCount) + return TripCount; + Loop *L = getLoop(); + + // Get the existing SSA value being used for the end condition of the loop. + if (!Condition) + if (!getLoopCondition(PassName, ORE)) + return nullptr; + + Value *ConditionEnd = Condition->getOperand(0); + { + if (!L->isLoopInvariant(ConditionEnd)) { + if (!L->isLoopInvariant(Condition->getOperand(1))) + return nullptr; + ConditionEnd = Condition->getOperand(1); + } + } + assert(L->isLoopInvariant(ConditionEnd) && + "Condition end is not loop invariant."); + + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + ScalarEvolution *SE = PSE.getSE(); + + // Find the loop boundaries. + const SCEV *BackedgeTakenCount = SE->getExitCount(L, L->getLoopLatch()); + + if (BackedgeTakenCount == SE->getCouldNotCompute()) { + LLVM_DEBUG(dbgs() << "Could not compute backedge-taken count.\n"); + return nullptr; + } + + const SCEV *ExitCount = getExitCount(BackedgeTakenCount, PSE); + + if (ExitCount == SE->getSCEV(ConditionEnd)) { + TripCount = ConditionEnd; + return TripCount; + } + + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + Type *IdxTy = getWidestInductionType(); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. 
+ SCEVExpander Exp(*SE, DL, "induction"); + + // Count holds the overall loop count (N). + TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + L->getLoopPreheader()->getTerminator()); + + if (TripCount->getType()->isPointerTy()) + TripCount = + CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", + L->getLoopPreheader()->getTerminator()); + + // Try to use the existing ConditionEnd for the trip count. + if (TripCount != ConditionEnd) { + // Compare the SCEV's of the TripCount and ConditionEnd to see if they're + // equal. Normalize these SCEV types to be IdxTy. + const SCEV *TripCountSCEV = + SE->getNoopOrAnyExtend(SE->getSCEV(TripCount), IdxTy); + const SCEV *ConditionEndSCEV = + SE->getNoopOrAnyExtend(SE->getSCEV(ConditionEnd), IdxTy); + if (SE->getMinusSCEV(TripCountSCEV, ConditionEndSCEV)->isZero()) + TripCount = ConditionEnd; + } + + return TripCount; +} + +/// Top-level call to prepare a Tapir loop for outlining. +bool TapirLoopInfo::prepareForOutlining( + DominatorTree &DT, LoopInfo &LI, TaskInfo &TI, + PredicatedScalarEvolution &PSE, AssumptionCache &AC, const char *PassName, + OptimizationRemarkEmitter &ORE, const TargetTransformInfo &TTI) { + LLVM_DEBUG(dbgs() << "Preparing loop for outlining " << *getLoop() << "\n"); + + // Collect the IVs in this loop. + collectIVs(PSE, PassName, &ORE); + + // If no primary induction was found, just bail. + if (!PrimaryInduction) + return false; + + LLVM_DEBUG(dbgs() << "\tPrimary induction " << *PrimaryInduction << "\n"); + + // Replace any non-primary IV's. + replaceNonPrimaryIVs(PSE); + + // Compute the trip count for this loop. + // + // We need the trip count for two reasons. + // + // 1) In the call to the helper that will replace this loop, we need to pass + // the total number of loop iterations. + // + // 2) In the helper itself, the strip-mined loop must iterate to the + // end-iteration argument, not the total number of iterations. + Value *TripCount = getOrCreateTripCount(PSE, PassName, &ORE); + if (!TripCount) { + ORE.emit(createMissedAnalysis(PassName, "NoTripCount", getLoop()) + << "could not compute finite loop trip count."); + return false; + } + + LLVM_DEBUG(dbgs() << "\tTrip count " << *TripCount << "\n"); + + // If necessary, rewrite the loop condition to use TripCount. This code + // should run very rarely, since IndVarSimplify should have already simplified + // the loop's induction variables. + if ((Condition->getOperand(0) != TripCount) && + (Condition->getOperand(1) != TripCount)) { + Loop *L = getLoop(); + // For now, we don't handle the case where there are multiple uses of the + // condition. + assert(Condition->hasOneUse() && + "Attempting to rewrite Condition with multiple uses."); + // Get the IV to use for the new condition: either PrimaryInduction or its + // incremented value, depending on whether the range is inclusive. + Value *IVForCond = + InclusiveRange + ? PrimaryInduction + : PrimaryInduction->getIncomingValueForBlock(L->getLoopLatch()); + // Get the parity of the LoopLatch terminator, i.e., whether the true or + // false branch is the backedge. + BranchInst *BI = dyn_cast(L->getLoopLatch()->getTerminator()); + bool BEBranchParity = (BI->getSuccessor(0) == L->getHeader()); + // Create the new Condition + ICmpInst *NewCond = + new ICmpInst(BEBranchParity ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ, + IVForCond, TripCount); + NewCond->setDebugLoc(Condition->getDebugLoc()); + // Replace the old Condition with the new Condition. 
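+    //
+    // IR sketch (illustrative, assuming a non-inclusive range whose backedge
+    // is the true successor): a latch condition
+    //   %cond = icmp ne i64 %i.next, %limit
+    //   br i1 %cond, label %header, label %exit
+    // is rewritten so that the comparison uses the computed trip count:
+    //   %cond = icmp ne i64 %i.next, %tripcount
+    // Only the bound changes; the predicate is chosen so that the backedge
+    // stays on the same successor.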
+ ReplaceInstWithInst(Condition, NewCond); + Condition = NewCond; + } + + // FIXME: This test is probably too simple. + assert(((Condition->getOperand(0) == TripCount) || + (Condition->getOperand(1) == TripCount)) && + "Condition does not use trip count."); + + // Fixup all external uses of the IVs. + for (auto &InductionEntry : Inductions) + fixupIVUsers(InductionEntry.first, InductionEntry.second, PSE); + + return true; +} + +/// Transforms an induction descriptor into a direct computation of its value at +/// Index. +/// +/// Copied from lib/Transforms/Vectorize/LoopVectorize.cpp +Value *llvm::emitTransformedIndex( + IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, + const InductionDescriptor &ID) { + + SCEVExpander Exp(*SE, DL, "induction"); + auto Step = ID.getStep(); + auto StartValue = ID.getStartValue(); + assert(Index->getType() == Step->getType() && + "Index type does not match StepValue type"); + + // Note: the IR at this point is broken. We cannot use SE to create any new + // SCEV and then expand it, hoping that SCEV's simplification will give us + // a more optimal code. Unfortunately, attempt of doing so on invalid IR may + // lead to various SCEV crashes. So all we can do is to use builder and rely + // on InstCombine for future simplifications. Here we handle some trivial + // cases only. + auto CreateAdd = [&B](Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Types don't match!"); + if (auto *CX = dyn_cast(X)) + if (CX->isZero()) + return Y; + if (auto *CY = dyn_cast(Y)) + if (CY->isZero()) + return X; + return B.CreateAdd(X, Y); + }; + + auto CreateMul = [&B](Value *X, Value *Y) { + assert(X->getType() == Y->getType() && "Types don't match!"); + if (auto *CX = dyn_cast(X)) + if (CX->isOne()) + return Y; + if (auto *CY = dyn_cast(Y)) + if (CY->isOne()) + return X; + return B.CreateMul(X, Y); + }; + + switch (ID.getKind()) { + case InductionDescriptor::IK_IntInduction: { + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) + return B.CreateSub(StartValue, Index); + auto *Offset = CreateMul( + Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); + return CreateAdd(StartValue, Offset); + } + case InductionDescriptor::IK_PtrInduction: { + assert(isa(Step) && + "Expected constant step for pointer induction"); + return B.CreateGEP( + nullptr, StartValue, + CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), + &*B.GetInsertPoint()))); + } + case InductionDescriptor::IK_FpInduction: { + assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); + auto InductionBinOp = ID.getInductionBinOp(); + assert(InductionBinOp && + (InductionBinOp->getOpcode() == Instruction::FAdd || + InductionBinOp->getOpcode() == Instruction::FSub) && + "Original bin op should be defined for FP induction"); + + Value *StepValue = cast(Step)->getValue(); + + // Floating point operations had to be 'fast' to enable the induction. + FastMathFlags Flags; + Flags.setFast(); + + Value *MulExp = B.CreateFMul(StepValue, Index); + if (isa(MulExp)) + // We have to check, the MulExp may be a constant. 
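
For the integer case, `emitTransformedIndex` reduces to computing `start + index * step`, with the `step == -1` shortcut handled separately. A plain-integer analogue, for illustration only:

```cpp
#include <cassert>

// Value of an integer induction variable at iteration `index`, mirroring the
// IK_IntInduction case: start + index * step, with the step == -1 shortcut.
long long transformedIndex(long long start, long long index, long long step) {
  if (step == -1)
    return start - index;
  return start + index * step;
}

int main() {
  // for (i = 10; ...; i += 3): at iteration 4 the IV is 10 + 4*3 = 22.
  assert(transformedIndex(10, 4, 3) == 22);
  // for (i = 10; ...; --i): at iteration 4 the IV is 10 - 4 = 6.
  assert(transformedIndex(10, 4, -1) == 6);
  return 0;
}
```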
+ cast(MulExp)->setFastMathFlags(Flags); + + Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, + "induction"); + if (isa(BOp)) + cast(BOp)->setFastMathFlags(Flags); + + return BOp; + } + case InductionDescriptor::IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); +} diff --git a/llvm/lib/Transforms/Tapir/TapirToTarget.cpp b/llvm/lib/Transforms/Tapir/TapirToTarget.cpp new file mode 100644 index 000000000000000..ed7c308b34edbea --- /dev/null +++ b/llvm/lib/Transforms/Tapir/TapirToTarget.cpp @@ -0,0 +1,605 @@ +//===- TapirToTarget.cpp - Convert Tapir into parallel-runtime calls ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass converts functions that use Tapir instructions to call out to a +// target parallel runtime system. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/TapirToTarget.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Timer.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Tapir.h" +#include "llvm/Transforms/Tapir/LoweringUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +#define DEBUG_TYPE "tapir2target" + +using namespace llvm; + +cl::opt DebugABICalls( + "debug-abi-calls", cl::init(false), cl::Hidden, + cl::desc("Insert ABI calls simply, to debug generated IR")); + +cl::opt UseExternalABIFunctions( + "use-external-abi-functions", cl::init(false), cl::Hidden, + cl::desc("Use ABI functions defined externally, rather than " + "compiler-generated versions")); + +static const char TimerGroupName[] = DEBUG_TYPE; +static const char TimerGroupDescription[] = "Tapir to Target"; + +class TapirToTargetImpl { +public: + TapirToTargetImpl(Module &M, function_ref GetAA, + function_ref GetDT, + function_ref GetTI, + function_ref GetAC, + function_ref GetTLI) + : M(M), GetAA(GetAA), GetDT(GetDT), GetTI(GetTI), GetAC(GetAC), + GetTLI(GetTLI) + {} + ~TapirToTargetImpl() { + if (Target) + delete Target; + } + + bool run(); + +private: + bool unifyReturns(Function &F); + bool processFunction(Function &F, SmallVectorImpl &NewHelpers); + TFOutlineMapTy outlineAllTasks(Function &F, + SmallVectorImpl &AllTaskFrames, + OutlineAnalysis &OA, TaskInfo &TI); + bool processSimpleABI(Function &F, BasicBlock *TFEntry); + bool processRootTask(Function &F, TFOutlineMapTy &TFToOutline, + OutlineAnalysis &OA, TaskInfo &TI); + bool processSpawnerTaskFrame(Spindle *TF, TFOutlineMapTy &TFToOutline, + OutlineAnalysis &OA, TaskInfo &TI); + bool processOutlinedTask(Task *T, TFOutlineMapTy &TFToOutline, + OutlineAnalysis &OA, TaskInfo &TI); + +private: + TapirTarget *Target = nullptr; + + Module &M; + + function_ref GetAA; + function_ref GetDT; + function_ref GetTI; + function_ref GetAC; + function_ref GetTLI; +}; + +bool TapirToTargetImpl::unifyReturns(Function &F) { + NamedRegionTimer NRT("unifyReturns", "Unify returns", TimerGroupName, + 
TimerGroupDescription, TimePassesIsEnabled); + SmallVector ReturningBlocks; + for (BasicBlock &BB : F) + if (isa(BB.getTerminator())) + ReturningBlocks.push_back(&BB); + + // If this function already has no returns or a single return, then terminate + // early. + if (ReturningBlocks.size() <= 1) + return false; + + BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), + "UnifiedReturnBlock", &F); + PHINode *PN = nullptr; + if (F.getReturnType()->isVoidTy()) { + ReturnInst::Create(F.getContext(), nullptr, NewRetBlock); + } else { + // If the function doesn't return void... add a PHI node to the block... + PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(), + "UnifiedRetVal", NewRetBlock); + ReturnInst::Create(F.getContext(), PN, NewRetBlock); + } + + // Loop over all of the blocks, replacing the return instruction with an + // unconditional branch. + // + for (BasicBlock *BB : ReturningBlocks) { + // Add an incoming element to the PHI node for every return instruction that + // is merging into this new block... + if (PN) + PN->addIncoming(BB->getTerminator()->getOperand(0), BB); + + BB->back().eraseFromParent(); // Remove the return insn + BranchInst::Create(NewRetBlock, BB); + } + return true; +} + +/// Outline all tasks in this function in post order. +TFOutlineMapTy +TapirToTargetImpl::outlineAllTasks(Function &F, + SmallVectorImpl &AllTaskFrames, + OutlineAnalysis &OA, TaskInfo &TI) { + NamedRegionTimer NRT("outlineAllTasks", "Outline all tasks", TimerGroupName, + TimerGroupDescription, TimePassesIsEnabled); + TFOutlineMapTy TFToOutline; + + // Determine the inputs for all tasks. + TFValueSetMap TFInputs, TFOutputs; + findAllTaskFrameInputs(TFInputs, TFOutputs, AllTaskFrames, F, OA.DT, TI); + + DenseMap> HelperInputs; + + for (Spindle *TF : AllTaskFrames) { + // At this point, all subtaskframess of TF must have been processed. + // Replace the tasks with calls to their outlined helper functions. + for (Spindle *SubTF : TF->subtaskframes()) + TFToOutline[SubTF].replaceReplCall( + replaceTaskFrameWithCallToOutline(SubTF, TFToOutline[SubTF], + HelperInputs[SubTF])); + + // TODO: Add support for outlining taskframes with no associated task. Such + // a facility would allow the frontend to create nested sync regions that + // are properly outlined. + + Task *T = TF->getTaskFromTaskFrame(); + if (!T) { + ValueToValueMapTy VMap; + ValueToValueMapTy InputMap; + TFToOutline[TF] = outlineTaskFrame(TF, TFInputs[TF], HelperInputs[TF], + &Target->getDestinationModule(), VMap, + Target->getArgStructMode(), + Target->getReturnType(), InputMap, OA); + // If the taskframe TF does not catch an exception from the taskframe, + // then the outlined function cannot throw. + if (F.doesNotThrow() && !getTaskFrameResume(TF->getTaskFrameCreate())) + TFToOutline[TF].Outline->setDoesNotThrow(); + // Don't inherit the noreturn attribute from the caller. + if (F.doesNotReturn()) + TFToOutline[TF].Outline->removeFnAttr(Attribute::NoReturn); + Target->addHelperAttributes(*TFToOutline[TF].Outline); + + // Allow the Target to update any internal structures after outlining. + Target->remapAfterOutlining(TF->getEntry(), VMap); + + // Update subtaskframe outline info to reflect the fact that their parent + // taskframe was outlined. + for (Spindle *SubTF : TF->subtaskframes()) + TFToOutline[SubTF].remapOutlineInfo(VMap, InputMap); + + continue; + } + + // Outline the task, if necessary, and add the outlined function to the + // mapping. 
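
To make the effect of `outlineTask`/`outlineTaskFrame` concrete, here is a source-level sketch of what outlining a spawned task produces. All names below are invented, and the struct packing only loosely mirrors an argument-struct `ArgStructMode`; the real pass operates on LLVM IR and captures the task's inputs computed by `findAllTaskFrameInputs`.

```cpp
#include <iostream>

// Invented illustration of what outlining a spawned task produces.
struct TaskInputs { // the values the task reads from its parent frame
  int *out;
  int value;
};

// The outlined helper: the task body with its captured inputs made explicit
// (packed in a struct here, roughly as an argument-struct mode would do).
static void outlined_task(TaskInputs in) { *in.out = in.value * 2; }

int main() {
  int result = 0;
  // In the parent, the detach is replaced with a call to the helper; a Tapir
  // target later wraps this call in its runtime's spawn mechanism.
  outlined_task(TaskInputs{&result, 21});
  std::cout << result << "\n"; // prints 42
  return 0;
}
```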
+ + ValueToValueMapTy VMap; + ValueToValueMapTy InputMap; + TFToOutline[TF] = outlineTask(T, TFInputs[TF], HelperInputs[TF], + &Target->getDestinationModule(), VMap, + Target->getArgStructMode(), + Target->getReturnType(), InputMap, OA); + // If the detach for task T does not catch an exception from the task, then + // the outlined function cannot throw. + if (F.doesNotThrow() && !T->getDetach()->hasUnwindDest()) + TFToOutline[TF].Outline->setDoesNotThrow(); + Target->addHelperAttributes(*TFToOutline[TF].Outline); + + // Update subtask outline info to reflect the fact that their spawner was + // outlined. + for (Spindle *SubTF : TF->subtaskframes()) + TFToOutline[SubTF].remapOutlineInfo(VMap, InputMap); + } + + // Insert calls to outlined helpers for taskframe roots. + for (Spindle *TF : TI.getRootTask()->taskframe_roots()) + TFToOutline[TF].replaceReplCall( + replaceTaskFrameWithCallToOutline(TF, TFToOutline[TF], + HelperInputs[TF])); + + return TFToOutline; +} + +/// Process the Tapir instructions in function \p F directly. +bool TapirToTargetImpl::processSimpleABI(Function &F, BasicBlock *TFEntry) { + NamedRegionTimer NRT("processSimpleABI", "Process simple ABI", TimerGroupName, + TimerGroupDescription, TimePassesIsEnabled); + + // Get the simple Tapir instructions to process, including syncs and + // loop-grainsize calls. + SmallVector Syncs; + SmallVector GrainsizeCalls; + SmallVector TaskFrameAddrCalls; + SmallVector TapirRTCalls; + SmallVector ReducerOperations; + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + // Record calls to get Tapir-loop grainsizes. + if (IntrinsicInst *II = dyn_cast(&I)) + if (Intrinsic::tapir_loop_grainsize == II->getIntrinsicID()) + GrainsizeCalls.push_back(II); + + // Record calls to task_frameaddr intrinsics. + if (IntrinsicInst *II = dyn_cast(&I)) + if (Intrinsic::task_frameaddress == II->getIntrinsicID()) + TaskFrameAddrCalls.push_back(II); + + // Record calls to tapir_runtime_start intrinsics. We rely on analyzing + // uses of these intrinsic calls to find calls to tapir_runtime_end. + if (IntrinsicInst *II = dyn_cast(&I)) + if (Intrinsic::tapir_runtime_start == II->getIntrinsicID()) + TapirRTCalls.push_back(II); + + // Record sync instructions in this function. + if (SyncInst *SI = dyn_cast(&I)) + Syncs.push_back(SI); + + if (!dyn_cast(&I)) + continue; + } + } + + // Lower simple Tapir instructions in this function. Collect the set of + // helper functions generated by this process. + bool Changed = false; + + // Lower calls to get Tapir-loop grainsizes. + while (!GrainsizeCalls.empty()) { + CallInst *GrainsizeCall = GrainsizeCalls.pop_back_val(); + LLVM_DEBUG(dbgs() << "Lowering grainsize call " << *GrainsizeCall << "\n"); + Target->lowerGrainsizeCall(GrainsizeCall); + Changed = true; + } + + // Lower calls to task_frameaddr intrinsics. + while (!TaskFrameAddrCalls.empty()) { + CallInst *TaskFrameAddrCall = TaskFrameAddrCalls.pop_back_val(); + LLVM_DEBUG(dbgs() << "Lowering task_frameaddr call " << *TaskFrameAddrCall + << "\n"); + Target->lowerTaskFrameAddrCall(TaskFrameAddrCall); + Changed = true; + } + Target->lowerTapirRTCalls(TapirRTCalls, F, TFEntry); + + // Process the set of syncs. 
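
`processSimpleABI` hands each `sync` instruction to the target's `lowerSync`. What that lowering produces is entirely target-specific; the toy below only illustrates the common Cilk-style shape (test the frame, call into the runtime only if children may still be outstanding), using placeholder names rather than any real runtime ABI.

```cpp
#include <cstdio>

// Placeholder frame type; not a real runtime ABI.
struct StackFrame { int outstanding_children = 0; };

static void runtime_sync(StackFrame &sf) {
  // A real runtime would suspend here until all spawned children finish.
  sf.outstanding_children = 0;
}

// Conceptual shape of a lowered `sync`: check the frame, call into the
// runtime only when children may still be running, then fall through to the
// sync continuation.
static void lowered_sync(StackFrame &sf) {
  if (sf.outstanding_children != 0)
    runtime_sync(sf);
}

int main() {
  StackFrame sf;
  sf.outstanding_children = 2;
  lowered_sync(sf);
  std::printf("outstanding children after sync: %d\n", sf.outstanding_children);
  return 0;
}
```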
+ while (!Syncs.empty()) { + SyncInst *SI = Syncs.pop_back_val(); + Target->lowerSync(*SI); + Changed = true; + } + + while (!ReducerOperations.empty()) { + CallBase *CI = ReducerOperations.pop_back_val(); + Target->lowerReducerOperation(CI); + Changed = true; + } + + return Changed; +} + +bool TapirToTargetImpl::processRootTask( + Function &F, TFOutlineMapTy &TFToOutline, OutlineAnalysis &OA, + TaskInfo &TI) { + NamedRegionTimer NRT("processRootTask", "Process root task", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + bool Changed = false; + // Check if the root task performs a spawn + bool PerformsSpawn = false; + for (Spindle *TF : TI.getRootTask()->taskframe_roots()) { + if (TF->getTaskFromTaskFrame()) { + PerformsSpawn = true; + break; + } + } + if (PerformsSpawn) { + Changed = true; + // Process root-task function F as a spawner. + Target->preProcessRootSpawner(F, &F.getEntryBlock()); + + // Process each call to a subtask. + for (Spindle *TF : TI.getRootTask()->taskframe_roots()) + if (TF->getTaskFromTaskFrame()) + Target->processSubTaskCall(TFToOutline[TF], OA.DT); + + Target->postProcessRootSpawner(F, &F.getEntryBlock()); + } + // Process the Tapir instructions in F directly. + Changed |= processSimpleABI(F, &F.getEntryBlock()); + return Changed; +} + +bool TapirToTargetImpl::processSpawnerTaskFrame( + Spindle *TF, TFOutlineMapTy &TFToOutline, OutlineAnalysis &OA, + TaskInfo &TI) { + NamedRegionTimer NRT("processSpawnerTaskFrame", "Process spawner taskframe", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + Function &F = *TFToOutline[TF].Outline; + + // Process function F as a spawner. + Target->preProcessRootSpawner(F, TF->getEntry()); + + // Process each call to a subtask. + for (Spindle *SubTF : TF->subtaskframes()) + if (SubTF->getTaskFromTaskFrame()) + Target->processSubTaskCall(TFToOutline[SubTF], OA.DT); + + Target->postProcessRootSpawner(F, TF->getEntry()); + + // Process the Tapir instructions in F directly. + processSimpleABI(F, TF->getEntry()); + return true; +} + +bool TapirToTargetImpl::processOutlinedTask( + Task *T, TFOutlineMapTy &TFToOutline, OutlineAnalysis &OA, TaskInfo &TI) { + NamedRegionTimer NRT("processOutlinedTask", "Process outlined task", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + Spindle *TF = getTaskFrameForTask(T); + Function &F = *TFToOutline[TF].Outline; + + Instruction *DetachPt = TFToOutline[TF].DetachPt; + Instruction *TaskFrameCreate = TFToOutline[TF].TaskFrameCreate; + + Target->preProcessOutlinedTask(F, DetachPt, TaskFrameCreate, !T->isSerial(), + TF->getEntry()); + // Process each call to a subtask. + for (Spindle *SubTF : TF->subtaskframes()) + if (SubTF->getTaskFromTaskFrame()) + Target->processSubTaskCall(TFToOutline[SubTF], OA.DT); + + Target->postProcessOutlinedTask(F, DetachPt, TaskFrameCreate, !T->isSerial(), + TF->getEntry()); + + // Process the Tapir instructions in F directly. + processSimpleABI(F, TF->getEntry()); + return true; +} + +// Helper method to check if the given taskframe spindle performs any spawns. +static bool isSpawningTaskFrame(const Spindle *TF) { + for (const Spindle *SubTF : TF->subtaskframes()) + if (SubTF->getTaskFromTaskFrame()) + return true; + return false; +} + +// Helper method to check if the given taskframe corresponds to a spawned task. 
+static bool isSpawnedTaskFrame(const Spindle *TF) { + return TF->getTaskFromTaskFrame(); +} + +bool TapirToTargetImpl::processFunction( + Function &F, SmallVectorImpl &NewHelpers) { + LLVM_DEBUG(dbgs() << "Tapir: Processing function " << F.getName() << "\n"); + + // Get the necessary analysis results. + OutlineAnalysis OA(GetAA(F), GetAC(F), GetDT(F)); + TaskInfo &TI = GetTI(F); + splitTaskFrameCreateBlocks(F, &OA.DT, &TI); + TI.findTaskFrameTree(); + + bool ChangedCFG = false; + { + NamedRegionTimer NRT("TargetPreProcess", "Target preprocessing", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + ChangedCFG = Target->preProcessFunction(F, TI); + } // end timed region + + // If we don't need to do outlining, then just handle the simple ABI. + if (!Target->shouldDoOutlining(F)) { + // Process the Tapir instructions in F directly. + if (!Target->processOrdinaryFunction(F, &F.getEntryBlock())) + processSimpleABI(F, &F.getEntryBlock()); + return ChangedCFG; + } + + // Traverse the tasks in this function in post order. + SmallVector AllTaskFrames; + + // Collect all taskframes in the function in postorder. + for (Spindle *TFRoot : TI.getRootTask()->taskframe_roots()) + for (Spindle *TFSpindle : post_order>(TFRoot)) + AllTaskFrames.push_back(TFSpindle); + + // Fixup external uses of values defined in taskframes. + for (Spindle *TF : AllTaskFrames) + fixupTaskFrameExternalUses(TF, TI, OA.DT); + + // Outline all tasks in a target-oblivious manner. + TFOutlineMapTy TFToOutline = outlineAllTasks(F, AllTaskFrames, OA, TI); + + // Perform target-specific processing of this function and all newly created + // helpers. + for (Spindle *TF : AllTaskFrames) { + if (isSpawningTaskFrame(TF) && !isSpawnedTaskFrame(TF)) + processSpawnerTaskFrame(TF, TFToOutline, OA, TI); + else if (isSpawnedTaskFrame(TF)) + processOutlinedTask(TF->getTaskFromTaskFrame(), TFToOutline, OA, TI); + else + if (!Target->processOrdinaryFunction(*TFToOutline[TF].Outline, + TF->getEntry())) + processSimpleABI(*TFToOutline[TF].Outline, TF->getEntry()); + NewHelpers.push_back(TFToOutline[TF].Outline); + } + // Process the root task + processRootTask(F, TFToOutline, OA, TI); + + { + NamedRegionTimer NRT("TargetPostProcess", "Target postprocessing", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + Target->postProcessFunction(F); + for (Function *H : NewHelpers) + Target->postProcessHelper(*H); + } // end timed region + + LLVM_DEBUG({ + NamedRegionTimer NRT("FunctionVerify", + "Post-lowering function verification", TimerGroupName, + TimerGroupDescription, TimePassesIsEnabled); + if (verifyFunction(F, &errs())) { + LLVM_DEBUG(dbgs() << "Function after lowering:" << F); + llvm_unreachable("Tapir lowering produced bad IR!"); + } + for (Function *H : NewHelpers) + if (verifyFunction(*H, &errs())) { + LLVM_DEBUG(dbgs() << "Function after lowering:" << *H); + llvm_unreachable("Tapir lowering produced bad IR!"); + } + }); + + return ChangedCFG || !NewHelpers.empty(); +} + +bool TapirToTargetImpl::run() { + // Add functions that detach to the work list. + SmallVector WorkList; + { + NamedRegionTimer NRT("shouldProcessFunction", "Find functions to process", + TimerGroupName, TimerGroupDescription, + TimePassesIsEnabled); + for (Function &F : M) { + if (F.empty()) + continue; + // TODO: Use per-function Tapir targets? 
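
The per-taskframe dispatch in `processFunction` (spawner, spawned task, or ordinary code) can be summarized as a small decision function. The restatement below uses booleans and strings in place of the `Spindle` queries and is for illustration only:

```cpp
#include <cassert>
#include <string>

// Condensed restatement of the dispatch in processFunction; names invented.
std::string classifyTaskFrame(bool performsSpawns, bool isSpawnedTask) {
  if (performsSpawns && !isSpawnedTask)
    return "process as spawner";        // processSpawnerTaskFrame
  if (isSpawnedTask)
    return "process as outlined task";  // processOutlinedTask
  return "process simple ABI only";     // processOrdinaryFunction / processSimpleABI
}

int main() {
  assert(classifyTaskFrame(true, false) == "process as spawner");
  assert(classifyTaskFrame(false, true) == "process as outlined task");
  // A spawned task that itself spawns is still processed as an outlined task.
  assert(classifyTaskFrame(true, true) == "process as outlined task");
  assert(classifyTaskFrame(false, false) == "process simple ABI only");
  return 0;
}
```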
+ if (!Target) { + TargetLibraryInfo &TLI = GetTLI(F); + Target = getTapirTargetFromID(M, TLI.getTapirTarget()); + if (TapirTargetOptions *Options = TLI.getTapirTargetOptions()) + Target->setOptions(*Options); + } + assert(Target && "Missing Tapir target"); + if (Target->shouldProcessFunction(F)) + WorkList.push_back(&F); + } + } + + // Quit early if there are no functions in this module to lower. + if (WorkList.empty()) + return false; + + // There are functions in this module to lower. Prepare the module for Tapir + // lowering. + Target->prepareModule(); + + bool Changed = false; + while (!WorkList.empty()) { + // Process the next function. + Function *F = WorkList.pop_back_val(); + SmallVector NewHelpers; + + Changed |= processFunction(*F, NewHelpers); + + // Check the generated helper functions to see if any need to be processed, + // that is, to see if any of them themselves detach a subtask. + { + NamedRegionTimer NRT("shouldProcessHelper", + "Find helper functions to process", TimerGroupName, + TimerGroupDescription, TimePassesIsEnabled); + for (Function *Helper : NewHelpers) + if (Target->shouldProcessFunction(*Helper)) + WorkList.push_back(Helper); + } + } + return Changed; +} + +PreservedAnalyses TapirToTargetPass::run(Module &M, ModuleAnalysisManager &AM) { + auto &FAM = AM.getResult(M).getManager(); + auto GetAA = [&FAM](Function &F) -> AAResults & { + return FAM.getResult(F); + }; + auto GetDT = [&FAM](Function &F) -> DominatorTree & { + return FAM.getResult(F); + }; + auto GetTI = [&FAM](Function &F) -> TaskInfo & { + return FAM.getResult(F); + }; + auto GetAC = [&FAM](Function &F) -> AssumptionCache & { + return FAM.getResult(F); + }; + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; + + bool Changed = TapirToTargetImpl(M, GetAA, GetDT, GetTI, GetAC, GetTLI).run(); + + if (Changed) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { +struct LowerTapirToTarget : public ModulePass { + static char ID; // Pass identification, replacement for typeid + explicit LowerTapirToTarget() : ModulePass(ID) { + initializeLowerTapirToTargetPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "Lower Tapir to target"; } + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } +}; +} // End of anonymous namespace + +char LowerTapirToTarget::ID = 0; +INITIALIZE_PASS_BEGIN(LowerTapirToTarget, "tapir2target", + "Lower Tapir to Target ABI", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) +INITIALIZE_PASS_END(LowerTapirToTarget, "tapir2target", + "Lower Tapir to Target ABI", false, false) + +bool LowerTapirToTarget::runOnModule(Module &M) { + if (skipModule(M)) + return false; + auto GetAA = [this](Function &F) -> AAResults & { + return this->getAnalysis(F).getAAResults(); + }; + auto GetDT = [this](Function &F) -> DominatorTree & { + return this->getAnalysis(F).getDomTree(); + }; + auto GetTI = [this](Function &F) -> TaskInfo & { + return this->getAnalysis(F).getTaskInfo(); + }; + AssumptionCacheTracker *ACT = &getAnalysis(); + auto GetAC = [&ACT](Function &F) -> AssumptionCache & { + return ACT->getAssumptionCache(F); + }; + auto GetTLI = [this](Function &F) -> 
TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; + + return TapirToTargetImpl(M, GetAA, GetDT, GetTI, GetAC, GetTLI).run(); +} + +// createLowerTapirToTargetPass - Provide an entry point to create this pass. +// +namespace llvm { +ModulePass *createLowerTapirToTargetPass() { return new LowerTapirToTarget(); } +} // namespace llvm diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 79911bf563ea42f..20664050cb5cf43 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -196,6 +196,17 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, Instruction *PTI = PredBB->getTerminator(); if (PTI->isSpecialTerminator() || PTI->mayHaveSideEffects()) return false; + // Don't break syncs. + if (isa(PredBB->getTerminator())) return false; + // Don't break entry blocks of detached CFG's. + for (pred_iterator PI = pred_begin(PredBB), PE = pred_end(PredBB); + PI != PE; ++PI) { + BasicBlock *PredPredBB = *PI; + if (const DetachInst *DI = + dyn_cast(PredPredBB->getTerminator())) + if (DI->getDetached() == PredBB) + return false; + } // Can't merge if there are multiple distinct successors. if (!PredecessorWithTwoSuccessors && PredBB->getUniqueSuccessor() != BB) @@ -793,7 +804,43 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, // block. assert(BB->getTerminator()->getNumSuccessors() == 1 && "Should have a single succ!"); - return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName); + if (SyncInst *OldSI = dyn_cast(BB->getTerminator())) { + // Insert a new basic block after BB. + std::string Name = BBName.str(); + BasicBlock *NewBB = BasicBlock::Create( + BB->getContext(), Name.empty() ? BB->getName() + ".split" : Name, + BB->getParent(), BB->getNextNode()); + DebugLoc Loc = Succ->front().getDebugLoc(); + // Terminate that block with an unconditional branch to Succ. + BranchInst::Create(Succ, NewBB)->setDebugLoc(Loc); + // Update the successor of the sync instruction to be NewBB. + OldSI->setSuccessor(0, NewBB); + // Update any PHI ndes in Succ. + NewBB->replaceSuccessorsPhiUsesWith(BB, NewBB); + + // The new block lives in whichever loop the old one did. This preserves + // LCSSA as well, because we force the split point to be after any PHI + // nodes. + if (LI) + if (Loop *L = LI->getLoopFor(BB)) + L->addBasicBlockToLoop(NewBB, *LI); + + if (DT) + // Old dominates New. New node dominates all other nodes dominated by Old. + if (DomTreeNode *OldNode = DT->getNode(BB)) { + std::vector Children(OldNode->begin(), OldNode->end()); + + DomTreeNode *NewNode = DT->addNewBlock(NewBB, BB); + for (DomTreeNode *I : Children) + DT->changeImmediateDominator(I, NewNode); + } + + // Note: We don't need to update MSSA in this case, because the sync + // instruction remains in the original basic block. 
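
The `SplitEdge` special case above keeps the `sync` in its original block and only redirects its successor edge through a fresh block that branches to the old successor. A toy CFG edit showing the same shape (representation and names invented):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Toy CFG: each block has a name and successor indices.  Splitting the edge
// BB -> Succ after a sync-style terminator keeps the terminator in BB and
// inserts a fresh block that just branches to Succ.
struct Block {
  std::string name;
  std::vector<int> succs;
};

int splitEdge(std::vector<Block> &cfg, int bb, int succ) {
  int newBB = static_cast<int>(cfg.size());
  cfg.push_back({cfg[bb].name + ".split", {succ}}); // NewBB: br Succ
  for (int &s : cfg[bb].succs)                      // redirect BB's edge
    if (s == succ)
      s = newBB;
  return newBB;
}

int main() {
  std::vector<Block> cfg = {{"entry", {1}}, {"sync.cont", {}}};
  int n = splitEdge(cfg, 0, 1);
  assert(cfg[0].succs[0] == n); // entry now targets the split block
  assert(cfg[n].succs[0] == 1); // which falls through to sync.cont
  return 0;
}
```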
+ return NewBB; + } else { + return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName); + } } void llvm::setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) { diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 4606514cbc71757..3ab73c69b27c877 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -31,6 +31,7 @@ #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; @@ -113,10 +114,29 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, const Twine &BBName) { assert(!isa(TI) && "Cannot split critical edge from IndirectBrInst"); + assert(!isa(TI) && + "Cannot split critical edge from ReattachInst"); + + bool SplittingDetachContinue = + isa(TI) || (isDetachedRethrow(TI) && (1 == SuccNum)) || + (isa(TI) && ((1 == SuccNum) || (2 == SuccNum))); + if (SplittingDetachContinue) + assert((Options.SplitDetachContinue && Options.DT) && + "Cannot split critical continuation edge from a detach"); BasicBlock *TIBB = TI->getParent(); BasicBlock *DestBB = TI->getSuccessor(SuccNum); + // If we're splitting a detach-continue edge, get the associated reattaches. + SmallVector Reattaches; + if (SplittingDetachContinue) { + BasicBlockEdge DetachEdge(TIBB, TI->getSuccessor(0)); + for (BasicBlock *Pred : predecessors(DestBB)) + if (isa(Pred->getTerminator())) + if (Options.DT->dominates(DetachEdge, Pred)) + Reattaches.push_back(Pred); + } + // Splitting the critical edge to a pad block is non-trivial. Don't do // it in this generic function. if (DestBB->isEHPad()) return nullptr; @@ -184,6 +204,12 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, // Branch to the new block, breaking the edge. TI->setSuccessor(SuccNum, NewBB); + // If we're splitting a detach-continue edge, redirect all appropriate + // reattach edges to branch to the new block + if (SplittingDetachContinue) + for (BasicBlock *RBB : Reattaches) + RBB->getTerminator()->setSuccessor(0, NewBB); + // If there are any PHI nodes in DestBB, we need to update them so that they // merge incoming values from NewBB instead of from TIBB. { @@ -203,6 +229,28 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, BBIdx = PN->getBasicBlockIndex(TIBB); PN->setIncomingBlock(BBIdx, NewBB); } + + // Update the PHI node entries for the reattach predecessors as well. + if (SplittingDetachContinue) { + for (BasicBlock *RBB : Reattaches) { + unsigned BBIdx = 0; + for (BasicBlock::iterator I = DestBB->begin(); isa(I); ++I) { + // We no longer enter through RBB, now we come in through NewBB. + // Revector exactly one entry in the PHI node that used to come from + // TIBB to come from NewBB. + PHINode *PN = cast(I); + + // Reuse the previous value of BBIdx if it lines up. In cases where we + // have multiple phi nodes with *lots* of predecessors, this is a speed + // win because we don't have to scan the PHI looking for TIBB. This + // happens because the BB list of PHI nodes are usually in the same + // order. 
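
The index-reuse trick in the PHI-update loops above relies on the PHI nodes of a block usually listing their incoming blocks in the same order, so the scan for the old predecessor can start where the previous PHI left off. A small standalone sketch of that caching idea (it assumes the old predecessor is present in every list):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Each "PHI" lists its incoming blocks in some order.  When revectoring the
// entry for oldPred, start the search at the index that worked for the
// previous PHI; most of the time it matches and the scan is O(1).
using Phi = std::vector<std::string>;

void revector(std::vector<Phi> &phis, const std::string &oldPred,
              const std::string &newPred) {
  size_t idx = 0;
  for (Phi &phi : phis) {
    if (idx >= phi.size() || phi[idx] != oldPred) {
      idx = 0;
      while (phi[idx] != oldPred) // assumes oldPred occurs in the list
        ++idx;
    }
    phi[idx] = newPred;
  }
}

int main() {
  std::vector<Phi> phis = {{"a", "b", "tibb"}, {"a", "b", "tibb"}};
  revector(phis, "tibb", "newbb");
  assert(phis[0][2] == "newbb" && phis[1][2] == "newbb");
  return 0;
}
```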
+ if (PN->getIncomingBlock(BBIdx) != RBB) + BBIdx = PN->getBasicBlockIndex(RBB); + PN->removeIncomingValue(BBIdx); + } + } + } } // If there are any other edges from TIBB to DestBB, update those to go diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 0c45bd886af9d2e..b57e5e6d269a2ff 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1399,6 +1399,20 @@ bool llvm::isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, isLibFuncEmittable(M, TLI, TheLibFunc); } +bool llvm::inferTapirTargetLibFuncAttributes(Function &F, + const TargetLibraryInfo &TLI) { + if (!TLI.isTapirTargetLibFunc(F)) + return false; + + bool Changed = false; + // FIXME: For now, we just set generic properties on Tapir-target library + // functions. + Changed |= setDoesNotFreeMemory(F); + Changed |= setWillReturn(F); + + return Changed; +} + bool llvm::hasFloatFn(const Module *M, const TargetLibraryInfo *TLI, Type *Ty, LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) { switch (Ty->getTypeID()) { diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index 1b811c7cebef9b6..98e388c278814e0 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -81,6 +81,9 @@ add_llvm_component_library(LLVMTransformUtils SplitModule.cpp StripNonLineTableDebugInfo.cpp SymbolRewriter.cpp + TapirUtils.cpp + TaskCanonicalize.cpp + TaskSimplify.cpp UnifyFunctionExitNodes.cpp UnifyLoopExits.cpp Utils.cpp diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index 47e3c03288d979d..2e671d59f4d4ade 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -82,6 +82,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, CodeInfo->ContainsCalls |= hasCalls; CodeInfo->ContainsMemProfMetadata |= hasMemProfMetadata; CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; + CodeInfo->ContainsDetach |= isa(BB->getTerminator()); } return NewBB; } @@ -640,6 +641,7 @@ void PruningFunctionCloner::CloneBlock( CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && BB != &BB->getParent()->front(); + CodeInfo->ContainsDetach |= isa(BB->getTerminator()); } } @@ -651,6 +653,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl &Returns, + SmallVectorImpl &Resumes, const char *NameSuffix, ClonedCodeInfo *CodeInfo) { assert(NameSuffix && "NameSuffix cannot be null!"); @@ -936,9 +939,12 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // because we can iteratively remove and merge returns above. 
for (Function::iterator I = cast(VMap[StartingBB])->getIterator(), E = NewFunc->end(); - I != E; ++I) + I != E; ++I) { if (ReturnInst *RI = dyn_cast(I->getTerminator())) Returns.push_back(RI); + if (ResumeInst *RI = dyn_cast(I->getTerminator())) + Resumes.push_back(RI); + } } /// This works exactly like CloneFunctionInto, @@ -951,9 +957,11 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, void llvm::CloneAndPruneFunctionInto( Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl &Returns, + SmallVectorImpl &Resumes, const char *NameSuffix, ClonedCodeInfo *CodeInfo) { CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap, - ModuleLevelChanges, Returns, NameSuffix, CodeInfo); + ModuleLevelChanges, Returns, Resumes, NameSuffix, + CodeInfo); } /// Remaps instructions in \p Blocks using the mapping in \p VMap. diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5bca5cf8ff91f7c..1783f81dd5bb9ed 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -919,6 +919,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::ReturnsTwice: case Attribute::Speculatable: case Attribute::StackAlignment: + case Attribute::Stealable: case Attribute::WillReturn: case Attribute::AllocKind: case Attribute::PresplitCoroutine: @@ -954,6 +955,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::SafeStack: case Attribute::ShadowCallStack: case Attribute::SanitizeAddress: + case Attribute::SanitizeCilk: case Attribute::SanitizeMemory: case Attribute::SanitizeNumericalStability: case Attribute::SanitizeThread: diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 68696789530f4a2..d309b1109780039 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -64,9 +64,12 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -593,6 +596,269 @@ static BasicBlock *HandleCallsInBlockInlinedThroughInvoke( return nullptr; } +// Helper method to check if the given UnwindEdge unwinds a taskframe, i.e., if +// it is terminated with a taskframe.resume intrinsic. +static bool isTaskFrameUnwind(const BasicBlock *UnwindEdge) { + return isTaskFrameResume(UnwindEdge->getTerminator()); +} + +static void splitTaskFrameEnds(Instruction *TFCreate) { + // Split taskframe.end that use TFCreate. + SmallVector TFEndToSplit; + for (User *U : TFCreate->users()) + if (IntrinsicInst *UI = dyn_cast(U)) + if (Intrinsic::taskframe_end == UI->getIntrinsicID()) + TFEndToSplit.push_back(UI); + + for (Instruction *TFEnd : TFEndToSplit) { + if (TFEnd != TFEnd->getParent()->getTerminator()->getPrevNode()) { + BasicBlock::iterator Iter = ++TFEnd->getIterator(); + SplitBlock(TFEnd->getParent(), &*Iter); + // Try to attach debug info to the new terminator after the taskframe.end + // call. 
+ Instruction *SplitTerminator = TFEnd->getParent()->getTerminator(); + if (!SplitTerminator->getDebugLoc()) + SplitTerminator->setDebugLoc(TFEnd->getDebugLoc()); + Iter->getParent()->setName(TFEnd->getParent()->getName() + ".tfend"); + } + } +} + +// Recursively handle inlined tasks. +static void HandleInlinedTasksHelper( + SmallPtrSetImpl &BlocksToProcess, + BasicBlock *FirstNewBlock, BasicBlock *UnwindEdge, + BasicBlock *Unreachable, Value *CurrentTaskFrame, + SmallVectorImpl *ParentWorklist, + LandingPadInliningInfo &Invoke, + SmallPtrSetImpl &InlinedLPads) { + SmallVector DetachesToReplace; + SmallVector Worklist; + // TODO: See if we need a global Visited set over all recursive calls, i.e., + // to handle shared exception-handling blocks. + SmallPtrSet Visited; + Worklist.push_back(FirstNewBlock); + do { + BasicBlock *BB = Worklist.pop_back_val(); + // Skip blocks we've seen before + if (!Visited.insert(BB).second) + continue; + // Skip blocks not in the set to process. + if (!BlocksToProcess.count(BB)) + continue; + + if (Instruction *TFCreate = + FindTaskFrameCreateInBlock(BB, CurrentTaskFrame)) { + // Split the block at the taskframe.create, if necessary. + BasicBlock *NewBB; + if (TFCreate != &BB->front()) { + NewBB = SplitBlock(BB, TFCreate); + BlocksToProcess.insert(NewBB); + } else + NewBB = BB; + + // Split any blocks containing taskframe.end intrinsics that use + // TFCreate. + splitTaskFrameEnds(TFCreate); + + // Create an unwind edge for the taskframe. + BasicBlock *TaskFrameUnwindEdge = + CreateSubTaskUnwindEdge(Intrinsic::taskframe_resume, TFCreate, + UnwindEdge, Unreachable, TFCreate); + + // Recursively check all blocks + HandleInlinedTasksHelper(BlocksToProcess, NewBB, TaskFrameUnwindEdge, + Unreachable, TFCreate, &Worklist, Invoke, + InlinedLPads); + + // Remove the unwind edge for the taskframe if it is not needed. + if (pred_empty(TaskFrameUnwindEdge)) + TaskFrameUnwindEdge->eraseFromParent(); + continue; + } + + // Promote any calls in the block to invokes. + if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke( + BB, UnwindEdge)) { + // If this is the topmost invocation of HandleInlinedTasksHelper, update + // any PHI nodes in the exceptional block to indicate that there is now a + // new entry in them. + if (nullptr == ParentWorklist) + Invoke.addIncomingPHIValuesFor(NewBB); + BlocksToProcess.insert( + cast(NewBB->getTerminator())->getNormalDest()); + } + + // Forward any resumes that are remaining here. + if (ResumeInst *RI = dyn_cast(BB->getTerminator())) + Invoke.forwardResume(RI, InlinedLPads); + + // Ignore reattach terminators. + if (isa(BB->getTerminator()) || + isDetachedRethrow(BB->getTerminator())) + continue; + + // If we find a taskframe.end, add its successor to the parent search. + if (endsTaskFrame(BB, CurrentTaskFrame)) { + // We may not have a parent worklist, if inlining itself created + // the taskframe. + if (ParentWorklist) + ParentWorklist->push_back(BB->getSingleSuccessor()); + continue; + } + + // If we find a taskframe.resume terminator, add its successor to the parent + // search. + if (isTaskFrameResume(BB->getTerminator()) && ParentWorklist) { + assert(isTaskFrameUnwind(UnwindEdge) && + "Unexpected taskframe.resume, doesn't correspond to unwind edge"); + InvokeInst *II = cast(BB->getTerminator()); + + // We may not have a parent worklist, however, if inlining itself created + // the taskframe. 
+ if (ParentWorklist) + ParentWorklist->push_back(II->getUnwindDest()); + continue; + } + + // Process a detach instruction specially. In particular, process the + // spawned task recursively. + if (DetachInst *DI = dyn_cast(BB->getTerminator())) { + if (!DI->hasUnwindDest()) { + // Create an unwind edge for the subtask, which is terminated with a + // detached-rethrow. + BasicBlock *SubTaskUnwindEdge = CreateSubTaskUnwindEdge( + Intrinsic::detached_rethrow, DI->getSyncRegion(), UnwindEdge, + Unreachable, DI); + + // Recursively check all blocks in the detached task. + HandleInlinedTasksHelper(BlocksToProcess, DI->getDetached(), + SubTaskUnwindEdge, Unreachable, + CurrentTaskFrame, &Worklist, Invoke, + InlinedLPads); + + // If the new unwind edge is not used, remove it. + if (pred_empty(SubTaskUnwindEdge)) + SubTaskUnwindEdge->eraseFromParent(); + else { + DetachesToReplace.push_back(DI); + // Update PHI nodes in the exceptional block to indicate that + // SubTaskUnwindEdge is a new entry in them. This should only have an + // effect for the topmost call to HandleInlinedTasksHelper. + Invoke.addIncomingPHIValuesFor(SubTaskUnwindEdge); + } + + } else if (Visited.insert(DI->getUnwindDest()).second) { + // If the detach-unwind isn't dead, add it to the worklist. + Worklist.push_back(DI->getUnwindDest()); + } + // Add the continuation to the worklist. + if (CurrentTaskFrame && isTaskFrameUnwind(UnwindEdge) && + (CurrentTaskFrame == getTaskFrameUsed(DI->getDetached()))) { + // This detach-continuation terminates the current taskframe, so push it + // onto the parent worklist. + assert(ParentWorklist && "Unexpected taskframe unwind edge"); + ParentWorklist->push_back(DI->getContinue()); + } else { + // We can process this detach-continuation directly, because it does not + // terminate the current taskframe. + Worklist.push_back(DI->getContinue()); + } + continue; + } + + // In the normal case, add all successors of BB to the worklist. + for (BasicBlock *Successor : successors(BB)) + Worklist.push_back(Successor); + + } while (!Worklist.empty()); + + // Replace detaches that now require unwind destinations. + while (!DetachesToReplace.empty()) { + DetachInst *DI = DetachesToReplace.pop_back_val(); + // If this is the topmost invocation of HandleInlinedTasksHelper, update any + // PHI nodes in the exceptional block to indicate that there is now a new + // entry in them. + if (nullptr == ParentWorklist) + Invoke.addIncomingPHIValuesFor(DI->getParent()); + ReplaceInstWithInst(DI, DetachInst::Create( + DI->getDetached(), DI->getContinue(), UnwindEdge, + DI->getSyncRegion())); + } +} + +static void HandleInlinedTasks( + SmallPtrSetImpl &BlocksToProcess, BasicBlock *FirstNewBlock, + Value *TFCreate, BasicBlock *UnwindEdge, LandingPadInliningInfo &Invoke, + SmallPtrSetImpl &InlinedLPads) { + Function *Caller = UnwindEdge->getParent(); + + // Create the normal return for the detached rethrow. + BasicBlock *UnreachableBlk = BasicBlock::Create( + Caller->getContext(), UnwindEdge->getName()+".unreachable", Caller); + + // Recursively handle inlined tasks. + HandleInlinedTasksHelper(BlocksToProcess, FirstNewBlock, UnwindEdge, + UnreachableBlk, TFCreate, nullptr, Invoke, + InlinedLPads); + + // Either finish the unreachable block or remove it, depending on whether it + // is used. 
+ if (!pred_empty(UnreachableBlk)) { + IRBuilder<> Builder(UnreachableBlk); + Builder.CreateUnreachable(); + } else { + UnreachableBlk->eraseFromParent(); + } +} + +static void GetInlinedLPads(SmallPtrSetImpl &BlocksToProcess, + SmallPtrSetImpl &InlinedLPads) { + SmallVector Worklist; + SmallPtrSet Visited; + + // Push all blocks to process that are terminated by a resume onto the + // worklist. + for (BasicBlock *BB : BlocksToProcess) + if (isa(BB->getTerminator())) + Worklist.push_back(BB); + + // Traverse the blocks to process from the resumes going backwards (through + // predecessors). + while(!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + // Skip blocks we've seen before + if (!Visited.insert(BB).second) + continue; + // Skip blocks not in the set to process. + if (!BlocksToProcess.count(BB)) + continue; + + // If BB is a landingpad... + if (BB->isLandingPad()) { + // Record BB's landingpad instruction. + InlinedLPads.insert(BB->getLandingPadInst()); + + // Add predecessors of BB to the worklist, skipping predecessors via a + // detached.rethrow or taskframe.resume. + for (BasicBlock *Predecessor : predecessors(BB)) + if (!isDetachedRethrow(Predecessor->getTerminator()) && + !isTaskFrameResume(Predecessor->getTerminator())) + Worklist.push_back(Predecessor); + + continue; + } + + // In the normal case, add predecessors of BB to the worklist, excluding + // predecessors via reattach, detached.rethrow, or taskframe.resume + for (BasicBlock *Predecessor : predecessors(BB)) + if (!isa(Predecessor->getTerminator()) && + !isDetachedRethrow(Predecessor->getTerminator()) && + !isTaskFrameResume(Predecessor->getTerminator())) + Worklist.push_back(Predecessor); + } +} + /// If we inlined an invoke site, we need to convert calls /// in the body of the inlined function into invokes. /// @@ -600,6 +866,7 @@ static BasicBlock *HandleCallsInBlockInlinedThroughInvoke( /// block of the inlined code (the last block is the end of the function), /// and InlineCodeInfo is information about the code that got inlined. static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock, + Value *TFCreate, ClonedCodeInfo &InlinedCodeInfo) { BasicBlock *InvokeDest = II->getUnwindDest(); @@ -610,6 +877,41 @@ static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock, // rewrite. LandingPadInliningInfo Invoke(II); + // Special processing is needed to inline a function that contains a task. + if (InlinedCodeInfo.ContainsDetach) { + // Get the set of blocks for the inlined function. + SmallPtrSet BlocksToProcess; + for (Function::iterator BB = FirstNewBlock->getIterator(), + E = Caller->end(); BB != E; ++BB) + BlocksToProcess.insert(&*BB); + + // Get all of the inlined landing pad instructions. + SmallPtrSet InlinedLPads; + GetInlinedLPads(BlocksToProcess, InlinedLPads); + + // Append the clauses from the outer landing pad instruction into the + // inlined landing pad instructions. + LandingPadInst *OuterLPad = Invoke.getLandingPadInst(); + for (LandingPadInst *InlinedLPad : InlinedLPads) { + unsigned OuterNum = OuterLPad->getNumClauses(); + InlinedLPad->reserveClauses(OuterNum); + for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx) + InlinedLPad->addClause(OuterLPad->getClause(OuterIdx)); + if (OuterLPad->isCleanup()) + InlinedLPad->setCleanup(true); + } + + // Process inlined subtasks. 
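
Appending the outer landing pad's clauses to every inlined landing pad, as done above, guarantees that the inlined pads can handle anything the call site's pad could. A minimal restatement with plain data structures (clauses are just strings here):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Sketch of clause propagation when inlining through an invoke: every inlined
// landing pad must also handle whatever the outer landing pad handled, so the
// outer clauses are appended and cleanup status is inherited.
struct LandingPad {
  std::vector<std::string> clauses;
  bool cleanup = false;
};

void inheritOuterClauses(LandingPad &inlined, const LandingPad &outer) {
  inlined.clauses.insert(inlined.clauses.end(), outer.clauses.begin(),
                         outer.clauses.end());
  if (outer.cleanup)
    inlined.cleanup = true;
}

int main() {
  LandingPad outer{{"catch @_ZTIi"}, true};
  LandingPad inlined{{"catch @_ZTId"}, false};
  inheritOuterClauses(inlined, outer);
  assert(inlined.clauses.size() == 2 && inlined.cleanup);
  return 0;
}
```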
+ HandleInlinedTasks(BlocksToProcess, FirstNewBlock, TFCreate, + Invoke.getOuterResumeDest(), Invoke, InlinedLPads); + // Now that everything is happy, we have one final detail. The PHI nodes in + // the exception destination block still have entries due to the original + // invoke instruction. Eliminate these entries (which might even delete the + // PHI node) now. + InvokeDest->removePredecessor(II->getParent()); + return; + } + // Get all of the inlined landing pad instructions. SmallPtrSet InlinedLPads; for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end(); @@ -1686,10 +1988,11 @@ static Value *HandleByValArgument(Type *ByValType, Value *Arg, if (ByValAlignment) Alignment = std::max(Alignment, *ByValAlignment); + BasicBlock *NewCtx = GetDetachedCtx(TheCall->getParent()); AllocaInst *NewAlloca = new AllocaInst(ByValType, Arg->getType()->getPointerAddressSpace(), nullptr, Alignment, Arg->getName()); - NewAlloca->insertBefore(Caller->begin()->begin()); + NewAlloca->insertBefore(&*NewCtx->begin()); IFI.StaticAllocas.push_back(NewAlloca); // Uses of the argument in the function should use our new alloca @@ -2128,6 +2431,125 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, } } +static bool isTaskFrameCreate(const Instruction &I) { + if (const IntrinsicInst *II = dyn_cast(&I)) + return Intrinsic::taskframe_create == II->getIntrinsicID(); + return false; +} + +static BasicBlock *SplitResume(ResumeInst *RI, Intrinsic::ID TermFunc, + Value *Token, BasicBlock *Unreachable) { + Value *RIValue = RI->getValue(); + BasicBlock *OldBB = RI->getParent(); + Module *M = OldBB->getModule(); + + // Split the resume block at the resume. + BasicBlock *NewBB = SplitBlock(OldBB, RI); + + // Invoke the specified terminator function at the end of the old block. + InvokeInst *TermFuncInvoke = InvokeInst::Create( + Intrinsic::getDeclaration(M, TermFunc, { RIValue->getType() }), + Unreachable, NewBB, { Token, RIValue }); + ReplaceInstWithInst(OldBB->getTerminator(), TermFuncInvoke); + + // Insert a landingpad at the start of the new block. + IRBuilder<> Builder(RI); + LandingPadInst *LPad = Builder.CreateLandingPad(RIValue->getType(), 0, + RIValue->getName()); + LPad->setCleanup(true); + + // Replace the argument of the resume with the value of the new landingpad. + RI->setOperand(0, LPad); + + return NewBB; +} + +static void HandleInlinedResumeInTask(BasicBlock *EntryBlock, BasicBlock *Ctx, + ResumeInst *Resume, + BasicBlock *Unreachable) { + // If the DetachedBlock has no predecessor, then it is the entry of the + // function. There's nothing to do in this case, so simply return. + if (pred_empty(EntryBlock) && EntryBlock == Ctx) + return; + + BasicBlock *Parent = + (EntryBlock != Ctx ? Ctx : EntryBlock->getSinglePredecessor()); + Module *M = Parent->getModule(); + if (isTaskFrameCreate(EntryBlock->front())) { + Value *TaskFrame = &EntryBlock->front(); + if (InvokeInst *TFResume = getTaskFrameResume(TaskFrame)) { + BasicBlock *ResumeDest = TFResume->getUnwindDest(); + // Replace the resume with a taskframe.resume, whose unwind destination + // matches the unwind destination of the taskframe. + InvokeInst *NewTFResume = InvokeInst::Create( + Intrinsic::getDeclaration(M, Intrinsic::taskframe_resume, + {Resume->getValue()->getType()}), + Unreachable, ResumeDest, {TaskFrame, Resume->getValue()}); + ReplaceInstWithInst(Resume, NewTFResume); + + // Update PHI nodes in ResumeDest. 
+ for (PHINode &PN : ResumeDest->phis()) + // Add an entry to the PHI node for the new predecessor block, + // NewTFResume->getParent(), using the same value as that from + // TFResume->getParent(). + PN.addIncoming(PN.getIncomingValueForBlock(TFResume->getParent()), + NewTFResume->getParent()); + + // No need to continue up the stack of contexts. + return; + } + + // Otherwise, split the resume to insert a novel invocation of + // taskframe.resume for this taskframe. + SplitResume(Resume, Intrinsic::taskframe_resume, TaskFrame, Unreachable); + + // Recursively handle parent contexts. + if (EntryBlock != Ctx) + HandleInlinedResumeInTask(Ctx, Ctx, Resume, Unreachable); + else { + BasicBlock *NewCtx = GetDetachedCtx(Parent); + HandleInlinedResumeInTask(NewCtx, NewCtx, Resume, Unreachable); + } + + } else { + assert(EntryBlock == Ctx && "Unexpected context for detached entry block."); + DetachInst *DI = cast(Parent->getTerminator()); + Value *SyncRegion = DI->getSyncRegion(); + + if (DI->hasUnwindDest()) { + // Replace the resume with a detached.rethrow, whose unwind destination + // matches the unwind destination of the detach. + BasicBlock *DetUnwind = DI->getUnwindDest(); + InvokeInst *NewDetRethrow = InvokeInst::Create( + Intrinsic::getDeclaration(M, Intrinsic::detached_rethrow, + {Resume->getValue()->getType()}), + Unreachable, DetUnwind, {SyncRegion, Resume->getValue()}); + ReplaceInstWithInst(Resume, NewDetRethrow); + + // Update PHI nodes in unwind dest. + for (PHINode &PN : DetUnwind->phis()) + // Add an entry to the PHI node for the new predecessor block, + // NewDetRethrow->getParent(), using the same value as that from Parent. + PN.addIncoming(PN.getIncomingValueForBlock(Parent), + NewDetRethrow->getParent()); + + // No need to continue up the stack of contexts. + return; + } + + // Insert an invocation of detached.rethrow before the resume. + BasicBlock *NewBB = SplitResume(Resume, Intrinsic::detached_rethrow, + SyncRegion, Unreachable); + + // Add NewBB as the unwind destination of DI. + ReplaceInstWithInst(DI, DetachInst::Create(EntryBlock, DI->getContinue(), + NewBB, SyncRegion)); + // Recursively handle parent contexts. + BasicBlock *NewCtx = GetDetachedCtx(Parent); + HandleInlinedResumeInTask(NewCtx, NewCtx, Resume, Unreachable); + } +} + /// This function inlines the called function into the basic block of the /// caller. This returns false if it is not possible to inline this call. /// The program is still in a well defined state if this occurs though. @@ -2235,14 +2657,44 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, ? Caller->getPersonalityFn()->stripPointerCasts() : nullptr; if (CalledPersonality) { + Triple T(Caller->getParent()->getTargetTriple()); if (!CallerPersonality) Caller->setPersonalityFn(CalledPersonality); - // If the personality functions match, then we can perform the - // inlining. Otherwise, we can't inline. - // TODO: This isn't 100% true. Some personality functions are proper - // supersets of others and can be used in place of the other. - else if (CalledPersonality != CallerPersonality) - return InlineResult::failure("incompatible personality"); + else if (CalledPersonality != CallerPersonality) { + // See if we want to replace CallerPersonality with the CalledPersonality, + // because CalledPersonality is a proper superset. + if (classifyEHPersonality(CallerPersonality) == + getDefaultEHPersonality(T)) + // The caller is using the default personality function. We assume + // CalledPersonality is a superset. 
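
The personality-function handling above amounts to a small compatibility table. The sketch below restates it with strings standing in for `EHPersonality` values; "default" means the target's default personality, and the names are illustrative only (the real code also handles the case where the caller has no personality at all).

```cpp
#include <cassert>
#include <string>

// Condensed restatement of the personality-compatibility decision above.
std::string resolvePersonalities(const std::string &caller,
                                 const std::string &called) {
  if (caller == called)
    return "inline";
  if (caller == "default")
    return "inline, adopt called personality";
  if (called == "Cilk_CXX" && caller == "GNU_CXX")
    return "inline, adopt Cilk personality";
  if (called == "default")
    return "inline";
  if (caller == "Cilk_CXX" && called == "GNU_CXX")
    return "inline";
  return "refuse to inline";
}

int main() {
  assert(resolvePersonalities("GNU_CXX", "Cilk_CXX") ==
         "inline, adopt Cilk personality");
  assert(resolvePersonalities("Cilk_CXX", "GNU_CXX") == "inline");
  assert(resolvePersonalities("SEH", "GNU_CXX") == "refuse to inline");
  return 0;
}
```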
+ Caller->setPersonalityFn(CalledPersonality); + + else if (classifyEHPersonality(CalledPersonality) == + EHPersonality::Cilk_CXX && + classifyEHPersonality(CallerPersonality) == + EHPersonality::GNU_CXX) + // The Cilk personality is a superset of the caller's. + Caller->setPersonalityFn(CalledPersonality); + + // If the personality functions match, then we can perform the + // inlining. Otherwise, we can't inline. + // TODO: This isn't 100% true. Some personality functions are proper + // supersets of others and can be used in place of the other. + else { + EHPersonality CalledEHPersonality = + classifyEHPersonality(CalledPersonality); + // We can inline if: + // - CalledPersonality is the default personality, or + // - CallerPersonality is the Cilk personality and CalledPersonality is + // GNU_CXX. + // Otherwise, declare that we can't inline. + if (CalledEHPersonality != getDefaultEHPersonality(T) && + (classifyEHPersonality(CallerPersonality) != + EHPersonality::Cilk_CXX || + CalledEHPersonality != EHPersonality::GNU_CXX)) + return InlineResult::failure("incompatible personality"); + } + } } // We need to figure out which funclet the callsite was in so that we may @@ -2282,6 +2734,11 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, } } + // Canonicalize the caller by splitting blocks containing taskframe.create + // intrinsics. + if (splitTaskFrameCreateBlocks(*Caller)) + OrigBB = CB.getParent(); + // Determine if we are dealing with a call in an EHPad which does not unwind // to caller. bool EHPadForCallUnwindsLocally = false; @@ -2295,6 +2752,18 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, !isa(CallSiteUnwindDestToken); } + // Get the entry block of the detached context into which we're inlining. If + // we move allocas from the inlined code, we must move them to this block. + BasicBlock *DetachedCtxEntryBlock; + { + DetachedCtxEntryBlock = GetDetachedCtx(OrigBB); + assert(((&(Caller->getEntryBlock()) == DetachedCtxEntryBlock) || + pred_empty(DetachedCtxEntryBlock) || + DetachedCtxEntryBlock->getSinglePredecessor()) && + "Entry block of detached context has multiple predecessors."); + } + bool MayBeUnsyncedAtCall = mayBeUnsynced(OrigBB); + // Get an iterator to the last basic block in the function, which will have // the new function inlined after it. Function::iterator LastBlock = --Caller->end(); @@ -2302,6 +2771,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // Make sure to capture all of the return instructions from the cloned // function. SmallVector Returns; + SmallVector Resumes; ClonedCodeInfo InlinedFunctionInfo; Function::iterator FirstNewBlock; @@ -2366,8 +2836,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // (which can happen, e.g., because an argument was constant), but we'll be // happy with whatever the cloner can do. CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, - /*ModuleLevelChanges=*/false, Returns, ".i", - &InlinedFunctionInfo); + /*ModuleLevelChanges=*/false, Returns, Resumes, + ".i", &InlinedFunctionInfo); // Remember the first block that is newly cloned over. FirstNewBlock = LastBlock; ++FirstNewBlock; @@ -2505,7 +2975,9 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // calculate which instruction they should be inserted before. We insert the // instructions at the end of the current alloca list. 
{ - BasicBlock::iterator InsertPoint = Caller->begin()->begin(); + BasicBlock::iterator InsertPoint = DetachedCtxEntryBlock->begin(); + if (isTaskFrameCreate(*InsertPoint)) + InsertPoint++; for (BasicBlock::iterator I = FirstNewBlock->begin(), E = FirstNewBlock->end(); I != E; ) { AllocaInst *AI = dyn_cast(I++); @@ -2536,9 +3008,31 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // Transfer all of the allocas over in a block. Using splice means // that the instructions aren't removed from the symbol table, then // reinserted. - I.setTailBit(true); - Caller->getEntryBlock().splice(InsertPoint, &*FirstNewBlock, - AI->getIterator(), I); + // KITSUNE FIXME: Is it safe to uncomment this? + // I.setTailBit(true); + DetachedCtxEntryBlock->splice(InsertPoint, &*FirstNewBlock, + AI->getIterator(), I); + } + + // Move any syncregion_start's into the entry basic block. Avoid moving + // syncregions if we'll need to insert a taskframe for this inlined call. + if (InlinedFunctionInfo.ContainsDetach && + !InlinedFunctionInfo.ContainsDynamicAllocas && !MayBeUnsyncedAtCall) { + for (BasicBlock::iterator I = FirstNewBlock->begin(), + E = FirstNewBlock->end(); I != E; ) { + IntrinsicInst *II = dyn_cast(I++); + if (!II) continue; + if (Intrinsic::syncregion_start != II->getIntrinsicID()) + continue; + + while (isa(I) && + Intrinsic::syncregion_start == + cast(I)->getIntrinsicID()) + ++I; + + DetachedCtxEntryBlock->splice(InsertPoint, &*FirstNewBlock, + II->getIterator(), I); + } } } @@ -2693,7 +3187,51 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // If the inlined code contained dynamic alloca instructions, wrap the inlined // code with llvm.stacksave/llvm.stackrestore intrinsics. - if (InlinedFunctionInfo.ContainsDynamicAllocas) { + CallInst *TFCreate = nullptr; + BasicBlock *TFEntryBlock = DetachedCtxEntryBlock; + if (InlinedFunctionInfo.ContainsDetach && + (InlinedFunctionInfo.ContainsDynamicAllocas || MayBeUnsyncedAtCall)) { + Module *M = Caller->getParent(); + // Get the two intrinsics we care about. + Function *TFCreateFn = + Intrinsic::getDeclaration(M, Intrinsic::taskframe_create); + + // Insert the llvm.taskframe.create. + TFCreate = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin()) + .CreateCall(TFCreateFn, {}, "tf.i"); + TFCreate->setDebugLoc(CB.getDebugLoc()); + TFEntryBlock = &*FirstNewBlock; + + // If we're inlining an invoke, insert a taskframe.resume at the unwind + // destination of the invoke. + if (auto *II = dyn_cast(&CB)) { + BasicBlock *UnwindEdge = II->getUnwindDest(); + // Create the normal return for the detached rethrow. + BasicBlock *UnreachableBlk = BasicBlock::Create( + Caller->getContext(), UnwindEdge->getName()+".unreachable", Caller); + { // Add an unreachable instruction to the end of UnreachableBlk. + IRBuilder<> Builder(UnreachableBlk); + Builder.CreateUnreachable(); + } + + // Create an unwind edge for the taskframe. + BasicBlock *TaskFrameUnwindEdge = CreateSubTaskUnwindEdge( + Intrinsic::taskframe_resume, TFCreate, UnwindEdge, + UnreachableBlk, II); + + for (PHINode &PN : UnwindEdge->phis()) + PN.replaceIncomingBlockWith(II->getParent(), TaskFrameUnwindEdge); + + // Replace the unwind destination of the invoke with the unwind edge for + // the taskframe. + II->setUnwindDest(TaskFrameUnwindEdge); + } + } else if (InlinedFunctionInfo.ContainsDynamicAllocas) { + Module *M = Caller->getParent(); + // Get the two intrinsics we care about. 
+ Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave); + Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore); + // Insert the llvm.stacksave. CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin()) .CreateStackSave("savedstack"); @@ -2719,10 +3257,51 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, BasicBlock *UnwindDest = II->getUnwindDest(); Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); if (isa(FirstNonPHI)) { - HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); + HandleInlinedLandingPad(II, &*FirstNewBlock, TFCreate, + InlinedFunctionInfo); } else { HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo); } + } else if (!Resumes.empty() && (&Caller->getEntryBlock() != TFEntryBlock)) { + // If we inlined into a detached task, and the inlined function contains + // resumes, then we need to insert additional calls to EH intrinsics, + // specifically, detached.rethrow and taskframe.resume. + + // Create the normal (unreachable) return for the invocations of EH + // intrinsics. + BasicBlock *UnreachableBlk = BasicBlock::Create( + Caller->getContext(), CalledFunc->getName()+".unreachable", + Caller); + { // Add an unreachable instruction to the end of UnreachableBlk. + IRBuilder<> Builder(UnreachableBlk); + Builder.CreateUnreachable(); + } + + ResumeInst *Resume = Resumes[0]; + + // If multiple resumes were inlined, unify them, so that the detach + // instruction has a single unwind destination. + if (Resumes.size() > 1) { + // Create the unified resume block. + BasicBlock *UnifiedResume = BasicBlock::Create( + Caller->getContext(), "eh.unified.resume.i", Caller); + // Add a PHI node at the beginning of the block. + IRBuilder<> Builder(UnifiedResume); + PHINode *PN = + Builder.CreatePHI(Resume->getValue()->getType(), Resumes.size()); + for (ResumeInst *RI : Resumes) { + // Insert incoming values to the PHI node. + PN->addIncoming(RI->getValue(), RI->getParent()); + // Replace the resume with a branch to the unified block. + ReplaceInstWithInst(RI, BranchInst::Create(UnifiedResume)); + } + // Insert a resume instruction at the end of the block. + Resume = Builder.CreateResume(PN); + } + + // Handle resumes within the task. + HandleInlinedResumeInTask(TFEntryBlock, DetachedCtxEntryBlock, Resume, + UnreachableBlk); } // Update the lexical scopes of the new funclets and callsites. @@ -2946,6 +3525,14 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, IFI.CallerBFI->getBlockFreq(OrigBB)); } + // If we inserted a taskframe.create, insert a taskframe.end at the start of + // AfterCallBB. + if (TFCreate) { + Function *TFEndFn = Intrinsic::getDeclaration(Caller->getParent(), + Intrinsic::taskframe_end); + IRBuilder<>(&AfterCallBB->front()).CreateCall(TFEndFn, TFCreate); + } + // Change the branch that used to go to AfterCallBB to branch to the first // basic block of the inlined function. // diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp index ab1edf47d8db0a0..8994a8d529ccc76 100644 --- a/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -88,6 +88,8 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, // instructions within the same loops, computing the exit blocks is // expensive, and we're not mutating the loop structure. SmallDenseMap> LoopExitBlocks; + // Similarly, cache the Loop TaskExits across this loop. 
+ SmallDenseMap> LoopTaskExits; while (!Worklist.empty()) { UsesToRewrite.clear(); @@ -105,6 +107,11 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, if (ExitBlocks.empty()) continue; + if (!LoopTaskExits.count(L)) + L->getTaskExits(LoopTaskExits[L]); + assert(LoopTaskExits.count(L)); + const SmallPtrSetImpl &TaskExits = LoopTaskExits[L]; + for (Use &U : make_early_inc_range(I->uses())) { Instruction *User = cast(U.getUser()); BasicBlock *UserBB = User->getParent(); @@ -121,7 +128,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, if (auto *PN = dyn_cast(User)) UserBB = PN->getIncomingBlock(U); - if (InstBB != UserBB && !L->contains(UserBB)) + if (InstBB != UserBB && !L->contains(UserBB) && !TaskExits.count(UserBB)) UsesToRewrite.push_back(&U); } @@ -178,7 +185,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, // If the exit block has a predecessor not within the loop, arrange for // the incoming value use corresponding to that predecessor to be // rewritten in terms of a different LCSSA PHI. - if (!L->contains(Pred)) + if (!L->contains(Pred) && !TaskExits.count(Pred)) UsesToRewrite.push_back( &PN->getOperandUse(PN->getOperandNumForIncomingValue( PN->getNumIncomingValues() - 1))); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 7192efe3f16b9d2..bbc3ba9e42dfd06 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -77,6 +77,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include #include #include @@ -1115,6 +1116,14 @@ static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB, replaceUndefValuesInPhi(PN, IncomingValues); } +static bool BlockIsEntryOfTask(const BasicBlock *BB) { + if (const BasicBlock *PredBB = BB->getSinglePredecessor()) + if (const DetachInst *DI = dyn_cast(PredBB->getTerminator())) + if (DI->getDetached() == BB) + return true; + return false; +} + bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, DomTreeUpdater *DTU) { assert(BB != &BB->getParent()->getEntryBlock() && @@ -1158,6 +1167,10 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, // something like a loop pre-header (or rarely, a part of an irreducible CFG); // folding the branch isn't profitable in that case anyway. if (!Succ->getSinglePredecessor()) { + // If Succ has multiple predecessors and BB is the entry of a detached task, + // we can't fold it BB into Succ. + if (BlockIsEntryOfTask(BB)) + return false; BasicBlock::iterator BBI = BB->begin(); while (isa(*BBI)) { for (Use &U : BBI->uses()) { @@ -3164,8 +3177,24 @@ static bool markAliveBlocks(Function &F, Instruction *llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) { Instruction *TI = BB->getTerminator(); - if (auto *II = dyn_cast(TI)) + if (auto *II = dyn_cast(TI)) { + // If we're removing the unwind destination of a detached rethrow or + // taskframe resume, simply remove the intrinsic. 
+ if (auto *Called = II->getCalledFunction()) { + if (Intrinsic::detached_rethrow == Called->getIntrinsicID() || + Intrinsic::taskframe_resume == Called->getIntrinsicID()) { + BranchInst *BI = BranchInst::Create(II->getNormalDest(), II); + BI->takeName(II); + BI->setDebugLoc(II->getDebugLoc()); + II->getUnwindDest()->removePredecessor(BB); + II->eraseFromParent(); + if (DTU) + DTU->applyUpdates({{DominatorTree::Delete, BB, II->getUnwindDest()}}); + return BI; + } + } return changeToCall(II, DTU); + } Instruction *NewTI; BasicBlock *UnwindDest; @@ -3182,6 +3211,10 @@ Instruction *llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) { NewTI = NewCatchSwitch; UnwindDest = CatchSwitch->getUnwindDest(); + } else if (auto *DI = dyn_cast(TI)) { + NewTI = DetachInst::Create(DI->getDetached(), DI->getContinue(), + DI->getSyncRegion(), DI); + UnwindDest = DI->getUnwindDest(); } else { llvm_unreachable("Could not find unwind successor"); } @@ -3233,6 +3266,73 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, DeleteDeadBlocks(BlocksToRemove.takeVector(), DTU); + removeDeadDetachUnwinds(F, DTU, MSSAU); + + return Changed; +} + +// Recursively check the task starting at TaskEntry to find detached-rethrows +// for tasks that cannot throw. +static bool recursivelyCheckDetachedRethrows( + BasicBlock *TaskEntry, SmallPtrSetImpl &DeadDU) { + SmallVector Worklist; + SmallPtrSet Visited; + BasicBlock *BB = TaskEntry; + Worklist.push_back(BB); + Visited.insert(BB); + do { + BB = Worklist.pop_back_val(); + + // Ignore reattach terminators + if (isa(BB->getTerminator())) + continue; + + // Detached-rethrow terminators indicate that the parent detach has a live + // unwind. + if (isDetachedRethrow(BB->getTerminator())) + return true; + + if (DetachInst *DI = dyn_cast(BB->getTerminator())) { + if (DI->hasUnwindDest()) { + // Recursively check all blocks in the detached task. + if (!recursivelyCheckDetachedRethrows(DI->getDetached(), DeadDU)) + DeadDU.insert(DI); + else if (Visited.insert(DI->getUnwindDest()).second) + // If the detach-unwind isn't dead, add it to the worklist. + Worklist.push_back(DI->getUnwindDest()); + } + + // We don't have to check the detached task for a detach with no unwind + // destination, because those tasks will not throw any exception. + + // Add the continuation to the worklist. + if (Visited.insert(DI->getContinue()).second) + Worklist.push_back(DI->getContinue()); + } else { + for (BasicBlock *Successor : successors(BB)) + if (Visited.insert(Successor).second) + Worklist.push_back(Successor); + } + } while (!Worklist.empty()); + return false; +} + +bool llvm::removeDeadDetachUnwinds(Function &F, DomTreeUpdater *DTU, + MemorySSAUpdater *MSSAU) { + SmallPtrSet DeadDU; + // Recusirvely check all tasks for dead detach-unwinds. + recursivelyCheckDetachedRethrows(&F.front(), DeadDU); + bool Changed = false; + // Scan the detach instructions and remove any dead detach-unwind edges. + for (BasicBlock &BB : F) + if (DetachInst *DI = dyn_cast(BB.getTerminator())) + if (DeadDU.count(DI)) { + removeUnwindEdge(&BB, DTU); + Changed = true; + } + // If any dead detach-unwinds were removed, remove unreachable blocks. 
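The `removeDeadDetachUnwinds` utility defined above is invoked from `removeUnreachableBlocks`, but it can also be driven directly. A hedged usage sketch; the wrapper is hypothetical, and `removeDeadDetachUnwinds` is assumed to be declared with the other helpers in `llvm/Transforms/Utils/Local.h`:

```cpp
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Strip unwind edges from detaches whose tasks provably cannot throw. The
// helper itself removes any blocks that become unreachable as a result.
static bool pruneDeadDetachUnwinds(Function &F, DominatorTree &DT) {
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  bool Changed = removeDeadDetachUnwinds(F, &DTU, /*MSSAU=*/nullptr);
  DTU.flush();
  return Changed;
}
```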
+ if (Changed) + removeUnreachableBlocks(F, DTU, MSSAU); return Changed; } diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 04042e71a2b82ef..9034fce500d28fa 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DebugInfo.h" @@ -65,6 +66,7 @@ class LoopRotate { DominatorTree *DT; ScalarEvolution *SE; MemorySSAUpdater *MSSAU; + TaskInfo *TaskI; const SimplifyQuery &SQ; bool RotationOnly; bool IsUtilMode; @@ -74,10 +76,10 @@ class LoopRotate { LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC, DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, - const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode, - bool PrepareForLTO) + TaskInfo *TaskI, const SimplifyQuery &SQ, bool RotationOnly, + bool IsUtilMode, bool PrepareForLTO) : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), - MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly), + MSSAU(MSSAU), TaskI(TaskI), SQ(SQ), RotationOnly(RotationOnly), IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {} bool processLoop(Loop *L); @@ -918,6 +920,12 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); + if (TaskI && DT) + // Recompute task info. + // FIXME: Figure out a way to update task info that is less + // computationally wasteful. + TaskI->recalculate(*DT->getRoot()->getParent(), *DT); + LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); ++NumRotated; @@ -1047,6 +1055,12 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); + if (TaskI && DT) + // Recompute task info. + // FIXME: Figure out a way to update task info that is less + // computationally wasteful. + TaskI->recalculate(*DT->getRoot()->getParent(), *DT); + return true; } @@ -1072,6 +1086,12 @@ bool LoopRotate::processLoop(Loop *L) { if ((MadeChange || SimplifiedLatch) && LoopMD) L->setLoopID(LoopMD); + if ((MadeChange || SimplifiedLatch) && TaskI && DT) + // Recompute task info. + // FIXME: Figure out a way to update task info that is less computationally + // wasteful. 
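The invalidation idiom above recurs after every CFG-restructuring loop utility touched by this patch (rotation and latch simplification here; loop-simplify, unrolling, and loop deletion below): if Tapir task info is cached and the dominator tree is current, recompute it wholesale. A small sketch of the idiom, factored into a helper that is not itself part of the patch:

```cpp
#include "llvm/Analysis/TapirTaskInfo.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

static void refreshTaskInfo(TaskInfo *TI, DominatorTree *DT) {
  if (TI && DT)
    // As the FIXMEs note, a full recalculation is wasteful; an incremental
    // update would be preferable if one becomes available.
    TI->recalculate(*DT->getRoot()->getParent(), *DT);
}
```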
+ TaskI->recalculate(*DT->getRoot()->getParent(), *DT); + return MadeChange || SimplifiedLatch; } @@ -1080,10 +1100,11 @@ bool LoopRotate::processLoop(Loop *L) { bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC, DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, - const SimplifyQuery &SQ, bool RotationOnly = true, + TaskInfo *TI, const SimplifyQuery &SQ, + bool RotationOnly = true, unsigned Threshold = unsigned(-1), bool IsUtilMode = true, bool PrepareForLTO) { - LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly, + LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, TI, SQ, RotationOnly, IsUtilMode, PrepareForLTO); return LR.processLoop(L); } diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index a764fef5749116b..8b46f7ba8a3d1b0 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -55,6 +55,7 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" @@ -543,6 +544,14 @@ static bool simplifyOneLoop(Loop *L, SmallVectorImpl &Worklist, if (Preheader) Changed = true; } + // Ensure that the preheader is not terminated by a sync. + if (Preheader && isa(Preheader->getTerminator())) { + LLVM_DEBUG(dbgs() + << "LoopSimplify: Splitting sync-terminated preheader.\n"); + SplitEdge(Preheader, L->getHeader(), DT, LI, MSSAU); + Changed = true; + Preheader = L->getLoopPreheader(); + } // Next, check to make sure that all exit nodes of the loop only have // predecessors that are inside of the loop. This check guarantees that the @@ -768,6 +777,7 @@ namespace { AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. AU.addPreserved(); AU.addPreserved(); + AU.addPreserved(); } /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. @@ -797,6 +807,8 @@ bool LoopSimplify::runOnFunction(Function &F) { DominatorTree *DT = &getAnalysis().getDomTree(); auto *SEWP = getAnalysisIfAvailable(); ScalarEvolution *SE = SEWP ? &SEWP->getSE() : nullptr; + auto *TIWP = getAnalysisIfAvailable(); + TaskInfo *TI = TIWP ? &TIWP->getTaskInfo() : nullptr; AssumptionCache *AC = &getAnalysis().getAssumptionCache(F); MemorySSA *MSSA = nullptr; @@ -820,6 +832,12 @@ bool LoopSimplify::runOnFunction(Function &F) { assert(InLCSSA && "LCSSA is broken after loop-simplify."); } #endif + if (Changed && TI) + // Update TaskInfo manually using the updated DT. + // + // FIXME: Recalculating TaskInfo for the whole function is wasteful. + // Optimize this routine in the future. + TI->recalculate(F, *DT); return Changed; } @@ -829,6 +847,7 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F, LoopInfo *LI = &AM.getResult(F); DominatorTree *DT = &AM.getResult(F); ScalarEvolution *SE = AM.getCachedResult(F); + TaskInfo *TI = AM.getCachedResult(F); AssumptionCache *AC = &AM.getResult(F); auto *MSSAAnalysis = AM.getCachedResult(F); std::unique_ptr MSSAU; @@ -847,6 +866,13 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F, if (!Changed) return PreservedAnalyses::all(); + if (Changed && TI) + // Update TaskInfo manually using the updated DT. + // + // FIXME: Recalculating TaskInfo for the whole function is wasteful. + // Optimize this routine in the future. 
+ TI->recalculate(F, *DT); + PreservedAnalyses PA; PA.preserve(); PA.preserve(); @@ -854,6 +880,7 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F, PA.preserve(); if (MSSAAnalysis) PA.preserve(); + PA.preserve(); // BPI maps conditional terminators to probabilities, LoopSimplify can insert // blocks, but it does so only by splitting existing blocks and edges. This // results in the interesting property that all new terminators inserted are diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index a0406111ecbf3bb..91c18563646582a 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -34,6 +34,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -64,6 +65,7 @@ #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include @@ -439,6 +441,111 @@ static bool canHaveUnrollRemainder(const Loop *L) { return true; } +namespace llvm { +// Wrapper class for GraphTraits to examine task exits of a loop. +template struct TaskExitGraph { + const GraphType &Graph; + + inline TaskExitGraph(const GraphType &G) : Graph(G) {} +}; + +// GraphTraits to examine task exits of a loop, to support using the post_order +// iterator to examine the task exits. +template <> struct GraphTraits> { + using NodeRef = BasicBlock *; + + struct TaskExitFilter { + NodeRef TaskExitPred = nullptr; + TaskExitFilter(NodeRef TaskExit) : TaskExitPred(TaskExit) {} + bool operator()(NodeRef N) const { + return !isDetachedRethrow(TaskExitPred->getTerminator()) && + !isTaskFrameResume(TaskExitPred->getTerminator()); + } + }; + + using ChildIteratorType = filter_iterator; + + static NodeRef getEntryNode(TaskExitGraph G) { return G.Graph; } + static ChildIteratorType child_begin(NodeRef N) { + return make_filter_range(successors(N), TaskExitFilter(N)).begin(); + } + static ChildIteratorType child_end(NodeRef N) { + return make_filter_range(successors(N), TaskExitFilter(N)).end(); + } +}; +} // namespace llvm + +// Clone task-exit blocks that are effectively part of the loop but don't appear +// to be based on standard loop analysis. +static void handleTaskExits( + SmallPtrSetImpl &TaskExits, + SmallPtrSetImpl &TaskExitSrcs, unsigned It, Loop *L, + BasicBlock *Header, BasicBlock *BBInsertPt, LoopInfo *LI, + NewLoopsMap &NewLoops, SmallSetVector &LoopsToSimplify, + ValueToValueMapTy &LastValueMap, SmallVectorImpl &NewBlocks, + std::vector &UnrolledLoopBlocks, DominatorTree *DT) { + // Get the TaskExits in reverse post order. Using post_order here seems + // necessary to ensure the custom filter for processing task exits is used. + SmallVector TaskExitsRPO; + for (BasicBlock *TEStart : TaskExitSrcs) + for (BasicBlock *BB : post_order>((TEStart))) + TaskExitsRPO.push_back(BB); + + if (TaskExitsRPO.empty()) + // No task exits to handle. + return; + + // Process the task exits similarly to loop blocks. + auto BlockInsertPt = std::next(BBInsertPt->getIterator()); + for (BasicBlock *BB : reverse(TaskExitsRPO)) { + ValueToValueMapTy VMap; + BasicBlock *New = CloneBasicBlock(BB, VMap, "." 
+ Twine(It)); + Header->getParent()->insert(BlockInsertPt, New); + + assert(BB != Header && "Header should not be a task exit"); + // Tell LI about New. + if (LI->getLoopFor(BB)) { + const Loop *OldLoop = addClonedBlockToLoopInfo(BB, New, LI, NewLoops); + if (OldLoop) + LoopsToSimplify.insert(NewLoops[OldLoop]); + } + + // Update our running map of newest clones + LastValueMap[BB] = New; + for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); + VI != VE; ++VI) + LastValueMap[VI->first] = VI->second; + + // Add phi entries for newly created values to all exit blocks. + for (BasicBlock *Succ : successors(BB)) { + if (L->contains(Succ) || TaskExits.count(Succ)) + continue; + for (PHINode &PHI : Succ->phis()) { + Value *Incoming = PHI.getIncomingValueForBlock(BB); + ValueToValueMapTy::iterator It = LastValueMap.find(Incoming); + if (It != LastValueMap.end()) + Incoming = It->second; + PHI.addIncoming(Incoming, New); + } + } + + NewBlocks.push_back(New); + UnrolledLoopBlocks.push_back(New); + + // Update DomTree: since we just copy the loop body, and each copy has a + // dedicated entry block (copy of the header block), this header's copy + // dominates all copied blocks. That means, dominance relations in the + // copied body are the same as in the original body. + if (DT) { + auto BBDomNode = DT->getNode(BB); + auto BBIDom = BBDomNode->getIDom(); + BasicBlock *OriginalBBIDom = BBIDom->getBlock(); + DT->addNewBlock( + New, cast(LastValueMap[cast(OriginalBBIDom)])); + } + } +} + /// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling /// can only fail when the loop's latch block is not terminated by a conditional /// branch instruction. However, if the trip count (and multiple) are not known, @@ -460,7 +567,8 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, - bool PreserveLCSSA, Loop **RemainderLoop, AAResults *AA) { + bool PreserveLCSSA, Loop **RemainderLoop, AAResults *AA, + TaskInfo *TI) { assert(DT && "DomTree is required"); if (!L->getLoopPreheader()) { @@ -549,6 +657,10 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // of the unrolled body exits. const bool CompletelyUnroll = ULO.Count == MaxTripCount; + // Disallow partial unrolling of Tapir loops. + if (getTaskIfTapirLoop(L, TI) && !CompletelyUnroll) + return LoopUnrollResult::Unmodified; + const bool PreserveOnlyFirst = CompletelyUnroll && MaxOrZero; // There's no point in performing runtime unrolling if this unroll count @@ -678,7 +790,12 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); + SmallPtrSet TaskExits; + L->getTaskExits(TaskExits); + std::vector UnrolledLoopBlocks = L->getBlocks(); + UnrolledLoopBlocks.insert(UnrolledLoopBlocks.end(), TaskExits.begin(), + TaskExits.end()); // Loop Unrolling might create new loops. While we do preserve LoopInfo, we // might break loop-simplified form for these loops (as they, e.g., would @@ -708,13 +825,14 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // Identify what noalias metadata is inside the loop: if it is inside the // loop, the associated metadata must be cloned for each iteration. 
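The template arguments in the unrolling hunks above appear to have been stripped from the patch text (for example in the `GraphTraits` specialization and the `post_order` call). A plausible reconstruction of the task-exit traversal in `handleTaskExits`, assuming the wrapper is instantiated as `TaskExitGraph<BasicBlock *>` and that `llvm/ADT/PostOrderIterator.h` is in scope:

```cpp
// Collect the task-exit blocks reachable from each source block, using the
// custom GraphTraits filter to stop at detached.rethrow/taskframe.resume
// edges, then visit them in reverse post order so that each clone sees its
// already-cloned predecessors.
SmallVector<BasicBlock *, 8> TaskExitsRPO;
for (BasicBlock *TEStart : TaskExitSrcs)
  for (BasicBlock *BB : post_order(TaskExitGraph<BasicBlock *>(TEStart)))
    TaskExitsRPO.push_back(BB);

for (BasicBlock *BB : reverse(TaskExitsRPO)) {
  // ... clone BB and fix up LoopInfo, the value map, PHIs, and the dominator
  // tree, as in handleTaskExits above ...
}
```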
SmallVector LoopLocalNoAliasDeclScopes; - identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes); + identifyNoAliasScopesToClone(UnrolledLoopBlocks, LoopLocalNoAliasDeclScopes); // We place the unrolled iterations immediately after the original loop // latch. This is a reasonable default placement if we don't have block // frequencies, and if we do, well the layout will be adjusted later. auto BlockInsertPt = std::next(LatchBlock->getIterator()); for (unsigned It = 1; It != ULO.Count; ++It) { + SmallPtrSet TaskExitSrcs; SmallVector NewBlocks; SmallDenseMap NewLoops; NewLoops[L] = L; @@ -764,6 +882,14 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, for (BasicBlock *Succ : successors(*BB)) { if (L->contains(Succ)) continue; + if (TaskExits.count(Succ)) { + if (llvm::none_of(predecessors(Succ), + [&TaskExits](const BasicBlock *B) { + return TaskExits.count(B); + })) + TaskExitSrcs.insert(Succ); + continue; + } for (PHINode &PHI : Succ->phis()) { Value *Incoming = PHI.getIncomingValueForBlock(*BB); ValueToValueMapTy::iterator It = LastValueMap.find(Incoming); @@ -804,6 +930,12 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } } + // Handle task-exit blocks from this loop similarly to ordinary loop-body + // blocks. + handleTaskExits(TaskExits, TaskExitSrcs, It, L, Header, Latches.back(), LI, + NewLoops, LoopsToSimplify, LastValueMap, NewBlocks, + UnrolledLoopBlocks, DT); + // Remap all instructions in the most recent iteration remapInstructionsInBlocks(NewBlocks, LastValueMap); for (BasicBlock *NewBlock : NewBlocks) @@ -1079,6 +1211,12 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, simplifyLoop(SubLoop, DT, LI, SE, AC, nullptr, PreserveLCSSA); } + // Update TaskInfo manually using the updated DT. + if (TI) + // FIXME: Recalculating TaskInfo for the whole function is wasteful. + // Optimize this routine in the future. + TI->recalculate(*Header->getParent(), *DT); + return CompletelyUnroll ? 
LoopUnrollResult::FullyUnrolled : LoopUnrollResult::PartiallyUnrolled; } diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 56aa96e550d9c3c..f2b7f3149e107b4 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -1026,7 +1026,7 @@ bool llvm::UnrollRuntimeLoopRemainder( assert(!getLoopConvergenceHeart(L) && "A loop with a convergence heart does not allow runtime unrolling."); UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI, - /*ORE*/ nullptr, PreserveLCSSA); + /*ORE*/ nullptr, PreserveLCSSA, /*TI*/ nullptr); } if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 0abf6d77496dcd9..e389187ae554c23 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -30,6 +30,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -45,6 +46,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/TapirUtils.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -54,11 +56,60 @@ using namespace llvm::PatternMatch; static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced"; static const char *LLVMLoopDisableLICM = "llvm.licm.disable"; +static void GetTaskExits(BasicBlock *TaskEntry, Loop *L, + SmallPtrSetImpl &TaskExits) { + // Traverse the CFG to find the exit blocks from SubT. + SmallVector Worklist; + SmallPtrSet Visited; + Worklist.push_back(TaskEntry); + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Record any block found in the task that is not contained in the loop + if (!L->contains(BB)) + TaskExits.insert(BB); + + // Stop the CFG traversal at any reattach or detached.rethrow + if (isa(BB->getTerminator()) || + isDetachedRethrow(BB->getTerminator())) + continue; + + // If we encounter a detach, only add its continuation and unwind + // destination + if (DetachInst *DI = dyn_cast(BB->getTerminator())) { + Worklist.push_back(DI->getContinue()); + if (DI->hasUnwindDest()) + Worklist.push_back(DI->getUnwindDest()); + continue; + } + + // For all other basic blocks, traverse all successors + for (BasicBlock *Succ : successors(BB)) + Worklist.push_back(Succ); + } +} + bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA) { bool Changed = false; + SmallPtrSet TaskExits; + { + SmallVector TaskEntriesToCheck; + for (auto *BB : L->blocks()) + if (DetachInst *DI = dyn_cast(BB->getTerminator())) + if (DI->hasUnwindDest()) + if (!L->contains(DI->getUnwindDest())) + TaskEntriesToCheck.push_back(DI->getDetached()); + + // For all tasks to check, get the loop exits that are in the task. + for (BasicBlock *TaskEntry : TaskEntriesToCheck) + GetTaskExits(TaskEntry, L, TaskExits); + } + // We re-use a vector for the in-loop predecesosrs. 
SmallVector InLoopPredecessors; @@ -71,7 +122,7 @@ bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, // keep track of the in-loop predecessors. bool IsDedicatedExit = true; for (auto *PredBB : predecessors(BB)) - if (L->contains(PredBB)) { + if (L->contains(PredBB) || TaskExits.count(PredBB)) { if (isa(PredBB->getTerminator())) // We cannot rewrite exiting edges from an indirectbr. return false; @@ -106,7 +157,23 @@ bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, for (auto *BB : L->blocks()) for (auto *SuccBB : successors(BB)) { // We're looking for exit blocks so skip in-loop successors. - if (L->contains(SuccBB)) + if (L->contains(SuccBB) || TaskExits.count(SuccBB) || + isTapirPlaceholderSuccessor(SuccBB)) + continue; + + // Visit each exit block exactly once. + if (!Visited.insert(SuccBB).second) + continue; + + Changed |= RewriteExit(SuccBB); + } + + // Visit exits from tasks within the loop as well. + for (auto *BB : TaskExits) + for (auto *SuccBB : successors(BB)) { + // We're looking for exit blocks so skip in-loop successors. + if (L->contains(SuccBB) || TaskExits.count(SuccBB) || + isTapirPlaceholderSuccessor(SuccBB)) continue; // Visit each exit block exactly once. @@ -174,6 +241,8 @@ void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); // FIXME: When all loop passes preserve MemorySSA, it can be required and // preserved here instead of the individual handling in each pass. } @@ -196,6 +265,7 @@ void llvm::initializeLoopPassPass(PassRegistry &Registry) { INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) + INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) } @@ -444,6 +514,29 @@ TransformationMode llvm::hasLICMVersioningTransformation(const Loop *L) { return TM_Unspecified; } +TransformationMode llvm::hasLoopStripmineTransformation(const Loop *L) { + if (getBooleanLoopAttribute(L, "tapir.loop.stripmine.disable")) + return TM_Disable; + + if (getBooleanLoopAttribute(L, "tapir.loop.stripmine.enable")) + return TM_ForcedByUser; + + return TM_Unspecified; +} + +TransformationMode llvm::hasLoopSpawningTransformation(const Loop *L) { + TapirLoopHints Hints(L); + + switch (Hints.getStrategy()) { + case TapirLoopHints::ST_DAC: { + return TM_ForcedByUser; + } case TapirLoopHints::ST_SEQ: + return TM_Disable; + default: + return TM_Unspecified; + } +} + /// Does a BFS from a given node to all of its children inside a given loop. /// The returned vector of nodes includes the starting point. SmallVector @@ -481,7 +574,7 @@ bool llvm::isAlmostDeadIV(PHINode *PN, BasicBlock *LatchBlock, Value *Cond) { void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, - LoopInfo *LI, MemorySSA *MSSA) { + LoopInfo *LI, TaskInfo *TI, MemorySSA *MSSA) { assert((!DT || L->isLCSSAForm(*DT)) && "Expected LCSSA!"); auto *Preheader = L->getLoopPreheader(); assert(Preheader && "Preheader should exist!"); @@ -719,6 +812,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, } LI->destroy(L); } + + if (TI && DT) + // Recompute task info. + // FIXME: Figure out a way to update task info that is less computationally + // wasteful. 
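The two transformation-mode queries added above follow the pattern of the existing `hasLICMVersioningTransformation` that precedes them, but key off the `tapir.loop.stripmine.*` metadata and the Tapir loop-spawning strategy. A hypothetical caller in a stripmining pass might gate itself as follows; the function and its cost-model flag are illustrative, and `hasLoopStripmineTransformation` is assumed to be declared in `llvm/Transforms/Utils/LoopUtils.h` alongside the other mode queries:

```cpp
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;

static bool shouldStripmine(const Loop *L, bool ProfitableByCostModel) {
  switch (hasLoopStripmineTransformation(L)) {
  case TM_Disable:
    return false;                  // "tapir.loop.stripmine.disable" was set
  case TM_ForcedByUser:
    return true;                   // "tapir.loop.stripmine.enable" was set
  default:
    return ProfitableByCostModel;  // no user directive; defer to the cost model
  }
}
```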
+ TI->recalculate(*DT->getRoot()->getParent(), *DT); } void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE, diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index c43c92a6b4d5d3f..b22423fed0136f7 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" diff --git a/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/llvm/lib/Transforms/Utils/Mem2Reg.cpp index 5ad7aeb463ecb20..f792746a524482c 100644 --- a/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Utils/Mem2Reg.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -31,27 +32,44 @@ using namespace llvm; #define DEBUG_TYPE "mem2reg" STATISTIC(NumPromoted, "Number of alloca's promoted"); +STATISTIC(NumNotParallelPromotable, "Number of alloca's not promotable due to " + "Tapir instructions"); static bool promoteMemoryToRegister(Function &F, DominatorTree &DT, - AssumptionCache &AC) { + AssumptionCache &AC, TaskInfo &TI) { std::vector Allocas; - BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function bool Changed = false; + // Scan the function to get its entry block and all entry blocks of detached + // CFG's. We can perform this scan for entry blocks once for the function, + // because this pass preserves the CFG. + SmallVector EntryBlocks; + for (Task *T : depth_first(TI.getRootTask())) { + EntryBlocks.push_back(T->getEntry()); + if (Value *TaskFrame = T->getTaskFrameUsed()) + EntryBlocks.push_back(cast(TaskFrame)->getParent()); + } + while (true) { Allocas.clear(); // Find allocas that are safe to promote, by looking at all instructions in // the entry node - for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) - if (AllocaInst *AI = dyn_cast(I)) // Is it an alloca? - if (isAllocaPromotable(AI)) - Allocas.push_back(AI); + for (BasicBlock *BB : EntryBlocks) + for (BasicBlock::iterator I = BB->begin(), E = --BB->end(); I != E; ++I) + if (AllocaInst *AI = dyn_cast(I)) { // Is it an alloca? 
+ if (isAllocaPromotable(AI)) { + if (TI.isAllocaParallelPromotable(AI)) + Allocas.push_back(AI); + else + ++NumNotParallelPromotable; + } + } if (Allocas.empty()) break; - PromoteMemToReg(Allocas, DT, &AC); + PromoteMemToReg(Allocas, DT, &AC, &TI); NumPromoted += Allocas.size(); Changed = true; } @@ -61,7 +79,8 @@ static bool promoteMemoryToRegister(Function &F, DominatorTree &DT, PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult(F); auto &AC = AM.getResult(F); - if (!promoteMemoryToRegister(F, DT, AC)) + auto &TI = AM.getResult(F); + if (!promoteMemoryToRegister(F, DT, AC, TI)) return PreservedAnalyses::all(); PreservedAnalyses PA; @@ -88,12 +107,14 @@ struct PromoteLegacyPass : public FunctionPass { DominatorTree &DT = getAnalysis().getDomTree(); AssumptionCache &AC = getAnalysis().getAssumptionCache(F); - return promoteMemoryToRegister(F, DT, AC); + TaskInfo &TI = getAnalysis().getTaskInfo(); + return promoteMemoryToRegister(F, DT, AC, TI); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.setPreservesCFG(); } }; @@ -107,6 +128,7 @@ INITIALIZE_PASS_BEGIN(PromoteLegacyPass, "mem2reg", "Promote Memory to " false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass) INITIALIZE_PASS_END(PromoteLegacyPass, "mem2reg", "Promote Memory to Register", false, false) diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 546a6cd56b25080..2f7f5c23f7f5ca2 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/Analysis/TapirTaskInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -250,12 +251,11 @@ struct AllocaInfo { // Remember the basic blocks which define new values for the alloca DefiningBlocks.push_back(SI->getParent()); OnlyStore = SI; - } else { - LoadInst *LI = cast(User); + } else if (LoadInst *LI = dyn_cast(User)) { // Otherwise it must be a load instruction, keep track of variable // reads. UsingBlocks.push_back(LI->getParent()); - } + } else continue; if (OnlyUsedInOneBlock) { if (!OnlyBlock) @@ -352,6 +352,8 @@ struct PromoteMem2Reg { /// A cache of @llvm.assume intrinsics used by SimplifyInstruction. AssumptionCache *AC; + TaskInfo *TI; + const SimplifyQuery SQ; /// Reverse mapping of Allocas. 
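For reference, the promotion gate from the Mem2Reg change above, rewritten compactly with its elided template arguments restored. `EntryBB`, `DT`, `AC`, and `TI` stand for the values already in scope in `promoteMemoryToRegister`; the surrounding worklist loop is simplified away:

```cpp
SmallVector<AllocaInst *, 16> Allocas;
for (Instruction &I : *EntryBB)
  if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
    // Promote an alloca only if it is promotable in the usual sense and
    // promoting it across detach/reattach edges is also safe.
    if (isAllocaPromotable(AI) && TI.isAllocaParallelPromotable(AI))
      Allocas.push_back(AI);

if (!Allocas.empty())
  PromoteMemToReg(Allocas, DT, &AC, &TI); // TaskInfo is the new parameter
```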
@@ -398,11 +400,10 @@ struct PromoteMem2Reg { public: PromoteMem2Reg(ArrayRef Allocas, DominatorTree &DT, - AssumptionCache *AC) + AssumptionCache *AC, TaskInfo *TI) : Allocas(Allocas.begin(), Allocas.end()), DT(DT), DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false), - AC(AC), SQ(DT.getRoot()->getDataLayout(), - nullptr, &DT, AC) {} + AC(AC), TI(TI), SQ(DT.getRoot()->getDataLayout(), nullptr, &DT, AC) {} void run(); @@ -754,6 +755,8 @@ void PromoteMem2Reg::run() { AllocaInst *AI = Allocas[AllocaNum]; assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!"); + assert((!TI || TI->isAllocaParallelPromotable(AI)) && + "Cannot promote non-promotable alloca in function with detach!"); assert(AI->getParent()->getParent() == &F && "All allocas should be in the same function, which is same as DF!"); @@ -822,18 +825,26 @@ void PromoteMem2Reg::run() { // to uses. SmallPtrSet LiveInBlocks; ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); + // Filter out live-in blocks that are not dominated by the alloca. + if (AI->getParent() != DT.getRoot()) { + SmallVector LiveInToRemove; + for (BasicBlock *LiveIn : LiveInBlocks) + if (!DT.dominates(AI->getParent(), LiveIn)) + LiveInToRemove.push_back(LiveIn); + for (BasicBlock *ToRemove : LiveInToRemove) + LiveInBlocks.erase(ToRemove); + } - // At this point, we're committed to promoting the alloca using IDF's, and - // the standard SSA construction algorithm. Determine which blocks need phi - // nodes and see if we can optimize out some work by avoiding insertion of - // dead phi nodes. + // Determine which blocks need PHI nodes and see if we can optimize out some + // work by avoiding insertion of dead phi nodes. IDF.setLiveInBlocks(LiveInBlocks); IDF.setDefiningBlocks(DefBlocks); SmallVector PHIBlocks; IDF.calculate(PHIBlocks); - llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) { - return BBNumbers.find(A)->second < BBNumbers.find(B)->second; - }); + if (PHIBlocks.size() > 1) + llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) { + return BBNumbers.find(A)->second < BBNumbers.find(B)->second; + }); unsigned CurrentVersion = 0; for (BasicBlock *BB : PHIBlocks) @@ -926,6 +937,32 @@ void PromoteMem2Reg::run() { } } + // Check if a PHI is inserted at a task-continue block. + { + bool badPhi = false; + for (DenseMap, PHINode *>::iterator + I = NewPhiNodes.begin(), + E = NewPhiNodes.end(); + I != E; ++I) { + PHINode *PN = I->second; + BasicBlock *BB = PN->getParent(); + // Only need to check once per block + if (&BB->front() != PN) + continue; + + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E; ++PI) { + BasicBlock *P = *PI; + if (isa(P->getTerminator())) { + LLVM_DEBUG(dbgs() << "Illegal PHI inserted in block " << BB->getName() + << "\n"); + badPhi = true; + } + } + } + assert(!badPhi && "PromoteMem2Reg inserted illegal phi."); + } + // At this point, the renamer has added entries to PHI nodes for all reachable // code. Unfortunately, there may be unreachable blocks which the renamer // hasn't traversed. If this is the case, the PHI nodes may not @@ -1238,10 +1275,10 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred, } void llvm::PromoteMemToReg(ArrayRef Allocas, DominatorTree &DT, - AssumptionCache *AC) { + AssumptionCache *AC, TaskInfo *TI) { // If there is nothing to do, bail out... 
if (Allocas.empty()) return; - PromoteMem2Reg(Allocas, DT, AC).run(); + PromoteMem2Reg(Allocas, DT, AC, TI).run(); } diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 2336466a25a178a..0b6885cfa9fe91c 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -646,6 +646,14 @@ class SCCPInstVisitor : public InstVisitor { void visitReturnInst(ReturnInst &I); void visitTerminator(Instruction &TI); + void visitReattachInst(ReattachInst &I) { + markOverdefined(&I); + visitTerminator(I); + } + void visitSyncInst(SyncInst &I) { + markOverdefined(&I); + visitTerminator(I); + } void visitCastInst(CastInst &I); void visitSelectInst(SelectInst &I); @@ -1158,6 +1166,19 @@ void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI, return; } + // In case of callbr, we pessimistically assume that all successors are + // feasible. + if (isa(&TI)) { + Succs.assign(TI.getNumSuccessors(), true); + return; + } + + // All destinations of a Tapir instruction are assumed to be feasible. + if (isa(&TI) || isa(&TI) || isa(&TI)) { + Succs.assign(TI.getNumSuccessors(), true); + return; + } + LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n'); llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 7fd3e51e141f303..32f59669b5b84f2 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -43,11 +43,18 @@ static AvailableValsTy &getAvailableVals(void *AV) { return *static_cast(AV); } +typedef DenseMap ValIsDetachedTy; +static ValIsDetachedTy &getValIsDetached(void *VID) { + return *static_cast(VID); +} + SSAUpdater::SSAUpdater(SmallVectorImpl *NewPHI) : InsertedPHIs(NewPHI) {} SSAUpdater::~SSAUpdater() { delete static_cast(AV); + if (VID) + delete static_cast(VID); } void SSAUpdater::Initialize(Type *Ty, StringRef Name) { @@ -55,6 +62,10 @@ void SSAUpdater::Initialize(Type *Ty, StringRef Name) { AV = new AvailableValsTy(); else getAvailableVals(AV).clear(); + if (!VID) + VID = new ValIsDetachedTy(); + else + getValIsDetached(VID).clear(); ProtoType = Ty; ProtoName = std::string(Name); } @@ -105,6 +116,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // predecessor. SmallVector, 8> PredValues; Value *SingularValue = nullptr; + SmallVector DetachPreds, ReattachPreds; // We can get our predecessor info by walking the pred_iterator list, but it // is relatively slow. If we already have PHI nodes in this block, walk one @@ -113,6 +125,12 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) { BasicBlock *PredBB = SomePhi->getIncomingBlock(i); Value *PredVal = GetValueAtEndOfBlock(PredBB); + if (isa(PredBB->getTerminator())) { + ReattachPreds.push_back(PredBB); + continue; + } + if (isa(PredBB->getTerminator())) + DetachPreds.push_back(PredBB); PredValues.push_back(std::make_pair(PredBB, PredVal)); // Compute SingularValue. 
@@ -125,6 +143,12 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { bool isFirstPred = true; for (BasicBlock *PredBB : predecessors(BB)) { Value *PredVal = GetValueAtEndOfBlock(PredBB); + if (isa(PredBB->getTerminator())) { + ReattachPreds.push_back(PredBB); + continue; + } + if (isa(PredBB->getTerminator())) + DetachPreds.push_back(PredBB); PredValues.push_back(std::make_pair(PredBB, PredVal)); // Compute SingularValue. @@ -135,6 +159,33 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { SingularValue = nullptr; } } + // Record any values we discover whose definitions occur in detached blocks. + if (!ReattachPreds.empty()) { + assert(!DetachPreds.empty() && + "Block has reattach predecessor but no detached predecessor."); + SmallVector, 8> DetachPredValues; + for (BasicBlock *DetachPred : DetachPreds) { + Value *DetachVal = GetValueAtEndOfBlock(DetachPred); + DetachPredValues.push_back(std::make_pair(DetachPred, DetachVal)); + } + for (BasicBlock *ReattachPred : ReattachPreds) { + Value *ReattachVal = GetValueAtEndOfBlock(ReattachPred); + bool FoundMatchingDetach = false; + for (std::pair DetachPredVal : DetachPredValues) { + if (DetachPredVal.second == ReattachVal) { + FoundMatchingDetach = true; + PredValues.push_back(std::make_pair(ReattachPred, ReattachVal)); + break; + } + } + if (!FoundMatchingDetach) { + SingularValue = nullptr; + getValIsDetached(VID)[BB] = true; + PredValues.push_back(std::make_pair( + ReattachPred, UndefValue::get(ReattachVal->getType()))); + } + } + } // If there are no predecessors, just return poison. if (PredValues.empty()) @@ -185,6 +236,10 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return InsertedPHI; } +bool SSAUpdater::GetValueIsDetachedInBlock(BasicBlock *BB) { + return getValIsDetached(VID)[BB]; +} + void SSAUpdater::RewriteUse(Use &U) { Instruction *User = cast(U.getUser()); @@ -313,6 +368,18 @@ class SSAUpdaterTraits { return PoisonValue::get(Updater->ProtoType); } + /// BlockReattaches - Return true if this block is terminated with a + /// reattach, false otherwise. + static bool BlockReattaches(BasicBlock *BB, SSAUpdater *Updater) { + return isa(BB->getTerminator()); + } + + /// BlockReattaches - Return true if this block is terminated with a + /// detach, false otherwise. + static bool BlockDetaches(BasicBlock *BB, SSAUpdater *Updater) { + return isa(BB->getTerminator()); + } + /// CreateEmptyPHI - Create a new PHI instruction in the specified block. /// Reserve space for the operands but do not fill them in yet. static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds, @@ -360,7 +427,8 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) { if (Value *V = AvailableVals[BB]) return V; - SSAUpdaterImpl Impl(this, &AvailableVals, InsertedPHIs); + SSAUpdaterImpl Impl(this, &AvailableVals, InsertedPHIs, + &getValIsDetached(VID)); return Impl.GetValue(BB); } @@ -481,7 +549,14 @@ void LoadAndStorePromoter::run(const SmallVectorImpl &Insts) { // Okay, now we rewrite all loads that use live-in values in the loop, // inserting PHI nodes as necessary. for (LoadInst *ALoad : LiveInLoads) { - Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent()); + BasicBlock *BB = ALoad->getParent(); + Value *NewVal = SSA.GetValueInMiddleOfBlock(BB); + + // Skip loads whose definitions are detached. + if (Instruction *Def = dyn_cast(NewVal)) + if (SSA.GetValueIsDetachedInBlock(Def->getParent())) + continue; + replaceLoadWithValue(ALoad, NewVal); // Avoid assertions in unreachable code. 
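A sketch of how a client consumes the new detached-definition query, mirroring the `LoadAndStorePromoter` change above with its elided template arguments restored. If the reaching definition for a live-in load exists only inside a detached task, the value does not validly flow across the reattach edge, so the load is left in place:

```cpp
for (LoadInst *ALoad : LiveInLoads) {
  Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent());

  // Skip loads whose reaching definition was marked as detached by the
  // SSAUpdater while it built the PHI for this block.
  if (auto *Def = dyn_cast<Instruction>(NewVal))
    if (SSA.GetValueIsDetachedInBlock(Def->getParent()))
      continue;

  replaceLoadWithValue(ALoad, NewVal);
}
```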
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index f23e28888931dfc..560e06226aeacde 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -73,6 +73,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -180,6 +181,11 @@ static cl::opt MaxSwitchCasesPerResult( "max-switch-cases-per-result", cl::Hidden, cl::init(16), cl::desc("Limit cases to analyze when converting a switch to select")); +static cl::opt PreserveAllSpawns( + "simplifycfg-preserve-all-spawns", cl::Hidden, cl::init(false), + cl::desc("Temporary development switch to ensure SimplifyCFG does not " + "eliminate spawns that immediately sync.")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); @@ -267,6 +273,7 @@ class SimplifyCFGOpt { bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder); bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder); bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder); + bool simplifySync(SyncInst *SI); bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, IRBuilder<> &Builder); @@ -1511,6 +1518,16 @@ static bool shouldHoistCommonInstructions(Instruction *I1, Instruction *I2, if (C1->isMustTailCall() != C2->isMustTailCall()) return false; + // Disallow hoisting of setjmp. Although hoisting the setjmp technically + // produces valid IR, it seems hard to generate appropariate machine code from + // this IR, e.g., for X86. + if (IntrinsicInst *II = dyn_cast(I1)) + if (Intrinsic::eh_sjlj_setjmp == II->getIntrinsicID()) + return false; + if (IntrinsicInst *II = dyn_cast(I2)) + if (Intrinsic::eh_sjlj_setjmp == II->getIntrinsicID()) + return false; + if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2)) return false; @@ -1583,6 +1600,14 @@ static void hoistLockstepIdenticalDbgVariableRecords( } } +// Helper function to check if an instruction is a taskframe.create call. +static bool isTaskFrameCreate(const Instruction *I) { + if (const IntrinsicInst *II = dyn_cast(I)) + if (Intrinsic::taskframe_create == II->getIntrinsicID()) + return true; + return false; +} + /// Hoist any common code in the successor blocks up into the block. This /// function guarantees that BB dominates all successors. If EqTermsOnly is /// given, only perform hoisting in case both blocks only contain a terminator. @@ -1619,6 +1644,23 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB, SuccIterPairs.push_back(SuccIterPair(SuccItr, 0)); } + // Skip taskframe.create calls. + while (isTaskFrameCreate(I1)) + I1 = &*BB1_Itr++; + while (isTaskFrameCreate(I2)) + I2 = &*BB2_Itr++; + if (isa(I1)) + return false; + + BasicBlock *BIParent = BI->getParent(); + + bool Changed = false; + + auto _ = make_scope_exit([&]() { + if (Changed) + ++NumHoistCommonCode; + }); + // Check if only hoisting terminators is allowed. This does not add new // instructions to the hoist location. if (EqTermsOnly) { @@ -1772,6 +1814,24 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB, } ++NumSkipped; } + + // KITSUNE FIXME: I have no idea if this is correct. + I1 = &*BB1_Itr++; + I2 = &*BB2_Itr++; + // Skip debug info if it is not identical. 
+ DbgInfoIntrinsic *DBI1 = dyn_cast(I1); + DbgInfoIntrinsic *DBI2 = dyn_cast(I2); + if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) { + while (isa(I1)) + I1 = &*BB1_Itr++; + while (isa(I2)) + I2 = &*BB2_Itr++; + } + // Skip taskframe.create calls. + while (isTaskFrameCreate(I1)) + I1 = &*BB1_Itr++; + while (isTaskFrameCreate(I2)) + I2 = &*BB2_Itr++; } } @@ -2778,8 +2838,13 @@ static bool MergeCompatibleInvokes(BasicBlock *BB, DomTreeUpdater *DTU) { // Record all the predecessors of this `landingpad`. As per verifier, // the only allowed predecessor is the unwind edge of an `invoke`. // We want to group "compatible" `invokes` into the same set to be merged. - for (BasicBlock *PredBB : predecessors(BB)) + for (BasicBlock *PredBB : predecessors(BB)) { + // Tapir allows a detach to be a predecessor of a landingpad. If we find a + // detach predecessor, quit early. + if (isa(PredBB->getTerminator())) + return Changed; Grouper.insert(cast(PredBB->getTerminator())); + } // And now, merge `invoke`s that were grouped togeter. for (ArrayRef Invokes : Grouper.Sets) { @@ -2894,7 +2959,8 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, // means it's never concurrently read or written, hence moving the store // from under the condition will not introduce a data race. auto *AI = dyn_cast(getUnderlyingObject(StorePtr)); - if (AI && !PointerMayBeCaptured(AI, false, true)) + if (AI && !PointerMayBeCaptured(AI, false, true) && + GetDetachedCtx(LI->getParent()) == GetDetachedCtx(AI->getParent())) // Found a previous load, return it. return LI; } @@ -5106,6 +5172,14 @@ bool SimplifyCFGOpt::simplifyCommonResume(ResumeInst *RI) { return !TrivialUnwindBlocks.empty(); } +static bool isTaskFrameUnassociated(const Value *TFCreate) { + for (const User *U : TFCreate->users()) + if (const Instruction *I = dyn_cast(U)) + if (isTapirIntrinsic(Intrinsic::taskframe_use, I)) + return false; + return true; +} + // Simplify resume that is only used by a single (non-phi) landing pad. bool SimplifyCFGOpt::simplifySingleResume(ResumeInst *RI) { BasicBlock *BB = RI->getParent(); @@ -5118,6 +5192,14 @@ bool SimplifyCFGOpt::simplifySingleResume(ResumeInst *RI) { make_range(LPInst->getNextNode(), RI))) return false; + // Check that no predecessor is a taskframe.resume for an unassociated + // taskframe. + for (const BasicBlock *Pred : predecessors(BB)) + if (isTaskFrameResume(Pred->getTerminator())) + if (isTaskFrameUnassociated( + cast(Pred->getTerminator())->getArgOperand(0))) + return false; + // Turn all invokes that unwind here into calls and delete the basic block. for (BasicBlock *Pred : llvm::make_early_inc_range(predecessors(BB))) { removeUnwindEdge(Pred, DTU); @@ -5398,9 +5480,10 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) { DTU->applyUpdates(Updates); Updates.clear(); } - auto *CI = cast(removeUnwindEdge(TI->getParent(), DTU)); - if (!CI->doesNotThrow()) - CI->setDoesNotThrow(); + if (auto *CI = + dyn_cast(removeUnwindEdge(TI->getParent(), DTU))) + if (!CI->doesNotThrow()) + CI->setDoesNotThrow(); Changed = true; } } else if (auto *CSI = dyn_cast(TI)) { @@ -5464,6 +5547,15 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) { new UnreachableInst(TI->getContext(), TI->getIterator()); TI->eraseFromParent(); Changed = true; + } else if (DetachInst *DI = dyn_cast(TI)) { + if (DI->getUnwindDest() == BB) { + // If the unwind destination of the detach is unreachable, simply remove + // the unwind edge. 
+ removeUnwindEdge(DI->getParent(), DTU); + Changed = true; + } + // Detaches of unreachables are handled via + // serializeDetachOfUnreachable. } } @@ -7264,6 +7356,13 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, // path instead and make ourselves dead. SmallSetVector UniquePreds(pred_begin(BB), pred_end(BB)); for (BasicBlock *Pred : UniquePreds) { + // Handle detach predecessors. + if (DetachInst *DI = dyn_cast(Pred->getTerminator())) { + assert(DI->getDetached() != BB && DI->getContinue() != BB && + DI->getUnwindDest() == BB && "unexpected detach successor"); + DI->setUnwindDest(OtherPred); + continue; + } InvokeInst *II = cast(Pred->getTerminator()); assert(II->getNormalDest() != BB && II->getUnwindDest() == BB && "unexpected successor"); @@ -7323,6 +7422,18 @@ bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI, !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB, DTU)) return true; + // If this branch goes to a reattach block with a single predecessor, merge + // the two blocks. + if (isa(Succ->getTerminator()) && Succ->getSinglePredecessor()) { + assert(!NeedCanonicalLoop && + "Reattach-terminated successor cannot by a loop header."); + // Preserve the name of BB, for cleanliness. + std::string BBName = BB->getName().str(); + MergeBasicBlockIntoOnlyPred(Succ, DTU); + Succ->setName(BBName); + return true; + } + // If the only instruction in the block is a seteq/setne comparison against a // constant, try to simplify the block. if (ICmpInst *ICI = dyn_cast(I)) @@ -7568,6 +7679,71 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { return false; } +bool SimplifyCFGOpt::simplifySync(SyncInst *SI) { + const Value *SyncRegion = SI->getSyncRegion(); + BasicBlock *Succ = SI->getSuccessor(0); + + // Get the first non-trivial instruction in the successor of the sync. Along + // the way, record a sync_unwind intrinsic for the sync if we find one. + Instruction *SyncUnwind = nullptr; + BasicBlock::iterator SuccI = + Succ->getFirstNonPHIOrDbg(true)->getIterator(); + if (isSyncUnwind(&*SuccI, SyncRegion)) { + SyncUnwind = &*SuccI; + if (isa(SyncUnwind)) + // We cannot eliminate syncs with associated sync-unwind that has an + // associated landingpad. + return false; + SuccI = Succ->getFirstNonPHIOrDbgOrSyncUnwind(true)->getIterator(); + } + + if (!SuccI->isTerminator()) + // There's nontrivial code in the successor of the sync, so don't eliminate + // the sync. + return false; + + if (SyncInst *SuccSI = dyn_cast(&*SuccI)) { + if (SuccSI->getSyncRegion() == SyncRegion) { + // The successor block is terminated by a sync in the same sync region, + // meaning the given sync is redundant. Eliminate the given sync. + if (SyncUnwind) + SyncUnwind->eraseFromParent(); + ReplaceInstWithInst(SI, BranchInst::Create(Succ)); + return requestResimplify(); + } + } + + // Otherwise check for an unconditional branch terminating the successor + // block. + if (!isa(*SuccI)) + return false; + + BranchInst *BI = dyn_cast(&*SuccI); + if (!BI->isUnconditional()) + return false; + + // Check if the successor of the unconditional branch simply contains a sync + // in the same sync region. + BasicBlock::iterator BrSuccI = + BI->getSuccessor(0)->getFirstNonPHIOrDbg(true)->getIterator(); + if (!BrSuccI->isTerminator()) + // There's nontrivial code in the successor of the sync, so don't eliminate + // it. 
+ return false; + if (SyncInst *SuccSI = dyn_cast(&*BrSuccI)) { + if (SuccSI->getSyncRegion() == SyncRegion) { + // The successor block is terminated by a sync in the same sync region, + // meaning the given sync is redundant. Eliminate the given sync. + if (SyncUnwind) + SyncUnwind->eraseFromParent(); + ReplaceInstWithInst(SI, BranchInst::Create(Succ)); + return requestResimplify(); + } + } + + return false; +} + /// Check if passing a value to an instruction will cause undefined behavior. static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValueMayBeModified) { Constant *C = dyn_cast(V); @@ -7767,6 +7943,239 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB, return false; } +/// If BB immediately syncs and BB's predecessor detaches, serialize the sync +/// and detach. This will allow normal serial optimization passes to remove the +/// blocks appropriately. Return false if BB does not terminate with a +/// reattach. +static bool serializeDetachToImmediateSync(BasicBlock *BB, + DomTreeUpdater *DTU) { + Instruction *I = BB->getFirstNonPHIOrDbgOrLifetime(); + if (isa(I)) { + // This block is empty + bool Changed = false; + // Collect the detach and reattach predecessors. + SmallPtrSet DetachPreds; + SmallVector ReattachPreds; + for (BasicBlock *PredBB : predecessors(BB)) { + if (DetachInst *DI = dyn_cast(PredBB->getTerminator())) { + // This transformation gets too complicated if the detached task might + // throw, so abort. + if (DI->hasUnwindDest()) + return false; + DetachPreds.insert(DI); + } + if (ReattachInst *RI = dyn_cast(PredBB->getTerminator())) + ReattachPreds.push_back(RI); + } + std::vector Updates; + Value *SyncRegion = cast(I)->getSyncRegion(); + for (DetachInst *DI : DetachPreds) { + BasicBlock *Detached = DI->getDetached(); + + // If this detached task uses a taskframe, mark those taskframe + // instrinsics to be erased. + SmallVector ToErase; + if (Value *TaskFrame = getTaskFrameUsed(Detached)) { + // If this detach uses a taskframe, record that taskframe.use. + for (User *U : TaskFrame->users()) { + if (IntrinsicInst *II = dyn_cast(U)) { + if (Intrinsic::taskframe_use == II->getIntrinsicID()) + ToErase.push_back(II); + else + // We need more complicated logic to effectively inline this + // taskframe, so abort. + return false; + } + } + ToErase.push_back(cast(TaskFrame)); + } + + // Replace the detach with a branch to the detached block. + BB->removePredecessor(DI->getParent()); + ReplaceInstWithInst(DI, BranchInst::Create(Detached)); + // Record update to DTU if DTU is available. + if (DTU) + Updates.push_back({DominatorTree::Delete, DI->getParent(), BB}); + + // Move static alloca instructions in the detached block to the + // appropriate entry block. + MoveStaticAllocasInBlock(cast(SyncRegion)->getParent(), + Detached, ReattachPreds); + + // Erase any instructions marked to be erased. + for (Instruction *I : ToErase) + I->eraseFromParent(); + + // We should not need to add new llvm.stacksave/llvm.stackrestore + // intrinsics, because we're not introducing new alloca's into a loop. + Changed = true; + } + for (Instruction *RI : ReattachPreds) { + // Replace the reattach with an unconditional branch. + ReplaceInstWithInst(RI, BranchInst::Create(BB)); + Changed = true; + } + // Update DTU if available. + if (DTU) + DTU->applyUpdates(Updates); + return Changed; + } + return false; +} + +/// If BB immediately reattaches and BB's predecessor detaches, serialize the +/// reattach and detach. 
This will allow normal serial optimization passes to
+/// remove the blocks appropriately.  Return false if BB does not terminate
+/// with a reattach or if some predecessor of BB does not terminate with a
+/// detach.
+static bool serializeTrivialDetachedBlock(BasicBlock *BB, DomTreeUpdater *DTU) {
+  Instruction *I = BB->getFirstNonPHIOrDbgOrLifetime();
+  SmallVector<Instruction *> ToErase;
+  // Skip a possible taskframe.use intrinsic in the task.
+  if (isTapirIntrinsic(Intrinsic::taskframe_use, I)) {
+    Value *TaskFrame = cast<IntrinsicInst>(I)->getArgOperand(0);
+    // Check for any other uses of TaskFrame.
+    for (User *U : TaskFrame->users())
+      if (U != I)
+        // We found another use of the taskframe, making it too complicated for
+        // us to handle.  Abort.
+        return false;
+    ToErase.push_back(I);
+    ToErase.push_back(cast<Instruction>(TaskFrame));
+    I = &*(++(I->getIterator()));
+  }
+  if (ReattachInst *RI = dyn_cast<ReattachInst>(I)) {
+    // This detached block is empty.
+    // Scan predecessors to verify that all of them detach BB.
+    for (BasicBlock *PredBB : predecessors(BB)) {
+      if (!isa<DetachInst>(PredBB->getTerminator()))
+        return false;
+    }
+    // All predecessors detach BB, so we can serialize.  Copy the predecessors
+    // into a separate vector, so we can safely remove the predecessors.
+    SmallVector<BasicBlock *> Preds(pred_begin(BB), pred_end(BB));
+    for (BasicBlock *PredBB : Preds) {
+      DetachInst *DI = dyn_cast<DetachInst>(PredBB->getTerminator());
+      BasicBlock *Detached = DI->getDetached();
+      BasicBlock *Continue = DI->getContinue();
+      assert(RI->getSuccessor(0) == Continue &&
+             "Reattach destination does not match continue block of associated "
+             "detach.");
+      // Remove the predecessor through the detach from the continue block.
+      Continue->removePredecessor(PredBB);
+      // Serialize the detach: replace it with an unconditional branch.
+      ReplaceInstWithInst(DI, BranchInst::Create(Detached));
+      // Update DTU if available.
+      if (DTU)
+        DTU->applyUpdates({{DominatorTree::Delete, PredBB, Continue}});
+    }
+    // Erase any instructions marked to be erased.
+    for (Instruction *I : ToErase)
+      I->eraseFromParent();
+    // Serialize the reattach: replace it with an unconditional branch.
+    ReplaceInstWithInst(RI, BranchInst::Create(RI->getSuccessor(0)));
+    return true;
+  }
+  return false;
+}
+
+/// If BB detaches a CFG that cannot reach the continuation, serialize the
+/// detach.  Assuming the CFG is valid, this scenario arises when the detached
+/// CFG is terminated by unreachable instructions.
+static bool serializeDetachOfUnreachable(BasicBlock *BB, DomTreeUpdater *DTU) {
+  // This method assumes that the detached CFG is valid.
+  Instruction *I = BB->getTerminator();
+  if (DetachInst *DI = dyn_cast<DetachInst>(I)) {
+    // Check if the continuation of the detach is not reached by reattach
+    // instructions.  If the detached CFG is valid, then the detached CFG must
+    // be terminated by unreachable instructions.
+    BasicBlock *Continue = DI->getContinue();
+    for (BasicBlock *PredBB : predecessors(Continue))
+      if (isa<ReattachInst>(PredBB->getTerminator()))
+        return false;
+
+    if (DI->hasUnwindDest())
+      // These detaches are too complicated for SimplifyCFG to handle.  Abort.
+      return false;
+
+    // If this detached task uses a taskframe, mark those taskframe intrinsics
+    // to be erased.
+    SmallVector<Instruction *> ToErase;
+    if (Value *TaskFrame = getTaskFrameUsed(DI->getDetached())) {
+      // If this detach uses a taskframe, remove that taskframe.
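+      // Any taskframe.use markers can be dropped along with the
+      // taskframe.create itself; any other taskframe intrinsic user (e.g., a
+      // taskframe.resume) indicates structure that the loop below
+      // conservatively refuses to handle.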
+ for (User *U : TaskFrame->users()) { + if (IntrinsicInst *II = dyn_cast(U)) { + if (Intrinsic::taskframe_use == II->getIntrinsicID()) + ToErase.push_back(II); + else + // We need more complicated logic to effectively inline this + // taskframe, so abort. + return false; + } + } + ToErase.push_back(cast(TaskFrame)); + } + + // Remove the predecessor through the detach from the continue block. + Continue->removePredecessor(BB); + // Update DTU if available. + if (DTU) + DTU->applyUpdates({{DominatorTree::Delete, BB, Continue}}); + // Replace the detach with a branch to the detached block. + ReplaceInstWithInst(DI, BranchInst::Create(DI->getDetached())); + // Erase any instructions marked to be erased. + for (Instruction *I : ToErase) + I->eraseFromParent(); + return true; + } + return false; +} + +// Remove any syncs whose sync region is empty, meaning that the region contains +// no detach instructions. These sync instructions don't synchronize anything, +// so they can be removed. +static bool removeEmptySyncs(BasicBlock *BB) { + if (SyncInst *SI = dyn_cast(BB->getTerminator())) { + // Get the sync region containing this sync + Value *SyncRegion = SI->getSyncRegion(); + bool SyncRegionIsEmpty = true; + SmallVector Syncs; + // Scan the Tapir instructions in this sync region. + for (User *U : SyncRegion->users()) { + // If the sync region contains a detach or a reattach, then it's not + // empty. + if (isa(U) || isa(U)) + SyncRegionIsEmpty = false; + // Collect the syncs in this region. + else if (isa(U)) + Syncs.push_back(cast(U)); + } + // If the sync region is empty, then remove all sync instructions in it. + if (SyncRegionIsEmpty) { + SmallPtrSet MaybeDeadSyncUnwinds; + for (SyncInst *Sync : Syncs) { + // Check for any sync.unwinds that might now be dead. + Instruction *MaybeSyncUnwind = + Sync->getSuccessor(0)->getFirstNonPHIOrDbgOrLifetime(); + if (isSyncUnwind(MaybeSyncUnwind, SyncRegion)) + MaybeDeadSyncUnwinds.insert(cast(MaybeSyncUnwind)); + + LLVM_DEBUG(dbgs() << "Removing empty sync " << *Sync << "\n"); + ReplaceInstWithInst(Sync, BranchInst::Create(Sync->getSuccessor(0))); + } + // Remove any dead sync.unwinds. + for (CallBase *CB : MaybeDeadSyncUnwinds) { + LLVM_DEBUG(dbgs() << "Remove dead sync unwind " << *CB << "? "); + if (removeDeadSyncUnwind(CB)) + LLVM_DEBUG(dbgs() << "Yes.\n"); + else + LLVM_DEBUG(dbgs() << "No.\n"); + } + return true; + } + } + return false; +} + bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { bool Changed = false; @@ -7794,6 +8203,15 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { if (removeUndefIntroducingPredecessor(BB, DTU, Options.AC)) return requestResimplify(); + // Check for and remove trivial detached blocks. + Changed |= serializeTrivialDetachedBlock(BB, DTU); + if (!PreserveAllSpawns) + Changed |= serializeDetachToImmediateSync(BB, DTU); + Changed |= serializeDetachOfUnreachable(BB, DTU); + + // Check for and remove sync instructions in empty sync regions. + Changed |= removeEmptySyncs(BB); + // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and // if there are no PHI nodes. 
@@ -7845,6 +8263,8 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { case Instruction::IndirectBr: Changed |= simplifyIndirectBr(cast(Terminator)); break; + case Instruction::Sync: + Changed |= simplifySync(cast(Terminator)); } return Changed; diff --git a/llvm/lib/Transforms/Utils/TapirUtils.cpp b/llvm/lib/Transforms/Utils/TapirUtils.cpp new file mode 100644 index 000000000000000..05dad65aae8523b --- /dev/null +++ b/llvm/lib/Transforms/Utils/TapirUtils.cpp @@ -0,0 +1,2524 @@ +//===- TapirUtils.cpp - Utility methods for Tapir --------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file utility methods for handling code containing Tapir instructions. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/TapirUtils.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/EHPersonalities.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +using namespace llvm; + +#define DEBUG_TYPE "tapirutils" + +// Check if the given instruction is an intrinsic with the specified ID. If a +// value \p V is specified, then additionally checks that the first argument of +// the intrinsic matches \p V. +bool llvm::isTapirIntrinsic(Intrinsic::ID ID, const Instruction *I, + const Value *V) { + if (const CallBase *CB = dyn_cast(I)) + if (const Function *Called = CB->getCalledFunction()) + if (ID == Called->getIntrinsicID()) + if (!V || (V == CB->getArgOperand(0))) + return true; + return false; +} + +/// Returns true if the given instruction performs a detached.rethrow, false +/// otherwise. If \p SyncRegion is specified, then additionally checks that the +/// detached.rethrow uses \p SyncRegion. +bool llvm::isDetachedRethrow(const Instruction *I, const Value *SyncRegion) { + return isa(I) && + isTapirIntrinsic(Intrinsic::detached_rethrow, I, SyncRegion); +} + +/// Returns true if the given instruction performs a taskframe.resume, false +/// otherwise. If \p TaskFrame is specified, then additionally checks that the +/// taskframe.resume uses \p TaskFrame. +bool llvm::isTaskFrameResume(const Instruction *I, const Value *TaskFrame) { + return isa(I) && + isTapirIntrinsic(Intrinsic::taskframe_resume, I, TaskFrame); +} + +/// Returns true if the given basic block \p B is a placeholder successor of a +/// taskframe.resume or detached.rethrow. 
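+/// In other words, every predecessor of \p B must be such an invoke, and \p B
+/// must be its normal destination (the placeholder edge) rather than its
+/// unwind destination.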
+bool llvm::isTapirPlaceholderSuccessor(const BasicBlock *B) { + for (const BasicBlock *Pred : predecessors(B)) { + if (!isDetachedRethrow(Pred->getTerminator()) && + !isTaskFrameResume(Pred->getTerminator())) + return false; + + const InvokeInst *II = dyn_cast(Pred->getTerminator()); + if (B != II->getNormalDest()) + return false; + } + return true; +} + +/// Returns a taskframe.resume that uses the given taskframe, or nullptr if no +/// taskframe.resume uses this taskframe. +InvokeInst *llvm::getTaskFrameResume(Value *TaskFrame) { + // It should suffice to get the unwind destination of the first + // taskframe.resume we find. + for (User *U : TaskFrame->users()) + if (Instruction *I = dyn_cast(U)) + if (isTaskFrameResume(I)) + return cast(I); + return nullptr; +} + +/// Returns the unwind destination of a taskframe.resume that uses the given +/// taskframe, or nullptr if no such unwind destination exists. +BasicBlock *llvm::getTaskFrameResumeDest(Value *TaskFrame) { + if (InvokeInst *TFResume = getTaskFrameResume(TaskFrame)) + return TFResume->getUnwindDest(); + return nullptr; +} + +/// Returns true if the given instruction is a sync.uwnind, false otherwise. If +/// \p SyncRegion is specified, then additionally checks that the sync.unwind +/// uses \p SyncRegion. +bool llvm::isSyncUnwind(const Instruction *I, const Value *SyncRegion, + bool CheckForInvoke) { + if (isTapirIntrinsic(Intrinsic::sync_unwind, I, SyncRegion)) + return !CheckForInvoke || isa(I); + return false; +} + +/// Returns true if BasicBlock \p B is a placeholder successor, that is, it's +/// the immediate successor of only detached-rethrow and taskframe-resume +/// instructions. +bool llvm::isPlaceholderSuccessor(const BasicBlock *B) { + for (const BasicBlock *Pred : predecessors(B)) { + if (!isDetachedRethrow(Pred->getTerminator()) && + !isTaskFrameResume(Pred->getTerminator())) + return false; + if (B == cast(Pred->getTerminator())->getUnwindDest()) + return false; + } + return true; +} + +/// Returns true if the given basic block ends a taskframe, false otherwise. If +/// \p TaskFrame is specified, then additionally checks that the +/// taskframe.end uses \p TaskFrame. +bool llvm::endsTaskFrame(const BasicBlock *B, const Value *TaskFrame) { + const Instruction *I = B->getTerminator()->getPrevNode(); + return I && isTapirIntrinsic(Intrinsic::taskframe_end, I, TaskFrame); +} + +/// Returns the spindle containing the taskframe.create used by task \p T, or +/// the entry spindle of \p T if \p T has no such taskframe.create spindle. +Spindle *llvm::getTaskFrameForTask(Task *T) { + Spindle *TF = T->getTaskFrameCreateSpindle(); + if (!TF) + TF = T->getEntrySpindle(); + return TF; +} + +// Removes the given sync.unwind instruction, if it is dead. Returns true if +// the sync.unwind was removed, false otherwise. +bool llvm::removeDeadSyncUnwind(CallBase *SyncUnwind, DomTreeUpdater *DTU) { + assert(isSyncUnwind(SyncUnwind) && + "removeDeadSyncUnwind not called on a sync.unwind."); + const Value *SyncRegion = SyncUnwind->getArgOperand(0); + + // Scan predecessor blocks for syncs using this sync.unwind. + for (BasicBlock *Pred : predecessors(SyncUnwind->getParent())) + if (SyncInst *SI = dyn_cast(Pred->getTerminator())) + if (SyncRegion == SI->getSyncRegion()) + return false; + + // We found no predecessor syncs that use this sync.unwind, so remove it. 
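+  // A sync.unwind lowered as an invoke is replaced by a branch to its normal
+  // destination, dropping its unwind edge; a plain call is simply erased.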
+ if (InvokeInst *II = dyn_cast(SyncUnwind)) { + II->getUnwindDest()->removePredecessor(II->getParent()); + if (DTU) + DTU->applyUpdates( + {{DominatorTree::Delete, II->getUnwindDest(), II->getParent()}}); + ReplaceInstWithInst(II, BranchInst::Create(II->getNormalDest())); + } else { + SyncUnwind->eraseFromParent(); + } + return true; +} + +/// Returns true if the reattach instruction appears to match the given detach +/// instruction, false otherwise. +/// +/// If a dominator tree is not given, then this method does a best-effort check. +/// In particular, this function might return true when the reattach instruction +/// does not actually match the detach instruction, but instead matches a +/// sibling detach instruction with the same continuation. This best-effort +/// check is sufficient in some cases, such as during a traversal of a detached +/// task. +bool llvm::ReattachMatchesDetach(const ReattachInst *RI, const DetachInst *DI, + DominatorTree *DT) { + // Check that the reattach instruction belonds to the same sync region as the + // detach instruction. + if (RI->getSyncRegion() != DI->getSyncRegion()) + return false; + + // Check that the destination of the reattach matches the continue destination + // of the detach. + if (RI->getDetachContinue() != DI->getContinue()) + return false; + + // If we have a dominator tree, check that the detach edge dominates the + // reattach. + if (DT) { + BasicBlockEdge DetachEdge(DI->getParent(), DI->getDetached()); + if (!DT->dominates(DetachEdge, RI->getParent())) + return false; + } + + return true; +} + +/// Returns true of the given task itself contains a sync instruction. +bool llvm::taskContainsSync(const Task *T) { + for (const Spindle *S : + depth_first>(T->getEntrySpindle())) { + if (S == T->getEntrySpindle()) + continue; + for (const BasicBlock *Pred : predecessors(S->getEntry())) + if (isa(Pred->getTerminator())) + return true; + } + return false; +} + +/// Return the result of AI->isStaticAlloca() if AI were moved to the entry +/// block. Allocas used in inalloca calls and allocas of dynamic array size +/// cannot be static. +/// (Borrowed from Transforms/Utils/InlineFunction.cpp) +static bool allocaWouldBeStaticInEntry(const AllocaInst *AI) { + return isa(AI->getArraySize()) && !AI->isUsedWithInAlloca(); +} + +// Check whether this Value is used by a lifetime intrinsic. +static bool isUsedByLifetimeMarker(Value *V) { + for (User *U : V->users()) { + if (IntrinsicInst *II = dyn_cast(U)) { + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + return true; + } + } + } + return false; +} + +// Check whether the given alloca already has lifetime.start or lifetime.end +// intrinsics. +static bool hasLifetimeMarkers(AllocaInst *AI) { + Type *Ty = AI->getType(); + Type *Int8PtrTy = + Type::getInt8PtrTy(Ty->getContext(), Ty->getPointerAddressSpace()); + if (Ty == Int8PtrTy) + return isUsedByLifetimeMarker(AI); + + // Do a scan to find all the casts to i8*. + for (User *U : AI->users()) { + if (U->getType() != Int8PtrTy) + continue; + if (U->stripPointerCasts() != AI) + continue; + if (isUsedByLifetimeMarker(U)) + return true; + } + return false; +} + +// Move static allocas in Block into Entry, which is assumed to dominate Block. +// Leave lifetime markers behind in Block and before each instruction in +// ExitPoints for those static allocas. Returns true if Block still contains +// dynamic allocas, which cannot be moved. 
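+// This mirrors how the inliner treats allocas when inlining a call: static
+// allocas become part of the enclosing frame, while dynamic allocas stay put,
+// and callers use the returned flag to decide whether to wrap the region in
+// llvm.stacksave/llvm.stackrestore.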
+bool llvm::MoveStaticAllocasInBlock( + BasicBlock *Entry, BasicBlock *Block, + SmallVectorImpl &ExitPoints) { + Function *F = Entry->getParent(); + SmallVector StaticAllocas; + bool ContainsDynamicAllocas = false; + BasicBlock::iterator InsertPoint = Entry->begin(); + for (BasicBlock::iterator I = Block->begin(), E = Block->end(); I != E;) { + AllocaInst *AI = dyn_cast(I++); + if (!AI) + continue; + + if (!allocaWouldBeStaticInEntry(AI)) { + ContainsDynamicAllocas = true; + continue; + } + + StaticAllocas.push_back(AI); + + // Scan for the block of allocas that we can move over, and move them all at + // once. + while (isa(I) && + allocaWouldBeStaticInEntry(cast(I))) { + StaticAllocas.push_back(cast(I)); + ++I; + } + + // Transfer all of the allocas over in a block. Using splice means that the + // instructions aren't removed from the symbol table, then reinserted. + Entry->splice(InsertPoint, &*Block, AI->getIterator(), I); + } + + // Move any syncregion_start's into the entry basic block. + for (BasicBlock::iterator I = Block->begin(), E = Block->end(); I != E;) { + IntrinsicInst *II = dyn_cast(I++); + if (!II) + continue; + if (Intrinsic::syncregion_start != II->getIntrinsicID()) + continue; + + while (isa(I) && + Intrinsic::syncregion_start == + cast(I)->getIntrinsicID()) + ++I; + + Entry->splice(InsertPoint, &*Block, II->getIterator(), I); + } + + // Leave lifetime markers for the static alloca's, scoping them to the + // from cloned block to cloned exit. + if (!StaticAllocas.empty()) { + IRBuilder<> Builder(&*Block->getFirstInsertionPt()); + for (unsigned ai = 0, ae = StaticAllocas.size(); ai != ae; ++ai) { + AllocaInst *AI = StaticAllocas[ai]; + // Don't mark swifterror allocas. They can't have bitcast uses. + if (AI->isSwiftError()) + continue; + + // If the alloca is already scoped to something smaller than the whole + // function then there's no need to add redundant, less accurate markers. + if (hasLifetimeMarkers(AI)) + continue; + + // Try to determine the size of the allocation. + ConstantInt *AllocaSize = nullptr; + if (ConstantInt *AIArraySize = + dyn_cast(AI->getArraySize())) { + auto &DL = F->getParent()->getDataLayout(); + Type *AllocaType = AI->getAllocatedType(); + uint64_t AllocaTypeSize = DL.getTypeAllocSize(AllocaType); + uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); + + // Don't add markers for zero-sized allocas. + if (AllocaArraySize == 0) + continue; + + // Check that array size doesn't saturate uint64_t and doesn't + // overflow when it's multiplied by type size. + if (AllocaArraySize != ~0ULL && + UINT64_MAX / AllocaArraySize >= AllocaTypeSize) { + AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), + AllocaArraySize * AllocaTypeSize); + } + } + + Builder.CreateLifetimeStart(AI, AllocaSize); + for (Instruction *ExitPoint : ExitPoints) + IRBuilder<>(ExitPoint).CreateLifetimeEnd(AI, AllocaSize); + } + } + + return ContainsDynamicAllocas; +} + +namespace { +/// A class for recording information about inlining a landing pad. +class LandingPadInliningInfo { + /// Destination of the invoke's unwind. + BasicBlock *OuterResumeDest; + + /// Destination for the callee's resume. + BasicBlock *InnerResumeDest = nullptr; + + /// LandingPadInst associated with the detach. + Value *SpawnerLPad = nullptr; + + /// PHI for EH values from landingpad insts. + PHINode *InnerEHValuesPHI = nullptr; + + SmallVector UnwindDestPHIValues; + + /// Dominator tree to update. 
+ DominatorTree *DT = nullptr; + +public: + LandingPadInliningInfo(DetachInst *DI, BasicBlock *EHContinue, + Value *LPadValInEHContinue, + DominatorTree *DT = nullptr) + : OuterResumeDest(EHContinue), SpawnerLPad(LPadValInEHContinue), DT(DT) { + // Find the predecessor block of OuterResumeDest. + BasicBlock *DetachBB = DI->getParent(); + BasicBlock *DetachUnwind = DI->getUnwindDest(); + while (DetachUnwind != OuterResumeDest) { + DetachBB = DetachUnwind; + DetachUnwind = DetachUnwind->getUniqueSuccessor(); + } + + // If there are PHI nodes in the unwind destination block, we need to keep + // track of which values came into them from the detach before removing the + // edge from this block. + BasicBlock::iterator I = OuterResumeDest->begin(); + for (; isa(I); ++I) { + if (&*I == LPadValInEHContinue) + continue; + // Save the value to use for this edge. + PHINode *PHI = cast(I); + UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(DetachBB)); + } + } + + LandingPadInliningInfo(InvokeInst *TaskFrameResume, + DominatorTree *DT = nullptr) + : OuterResumeDest(TaskFrameResume->getUnwindDest()), + SpawnerLPad(TaskFrameResume->getLandingPadInst()), DT(DT) { + // If there are PHI nodes in the unwind destination block, we need to keep + // track of which values came into them from the detach before removing the + // edge from this block. + BasicBlock *InvokeBB = TaskFrameResume->getParent(); + BasicBlock::iterator I = OuterResumeDest->begin(); + for (; isa(I); ++I) { + // Save the value to use for this edge. + PHINode *PHI = cast(I); + UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB)); + } + } + + /// The outer unwind destination is the target of unwind edges introduced for + /// calls within the inlined function. + BasicBlock *getOuterResumeDest() const { return OuterResumeDest; } + + BasicBlock *getInnerResumeDest(); + + /// Forward a task resume - a terminator, such as a detached.rethrow or + /// taskframe.resume, marking the exit from a task for exception handling - to + /// the spawner's landing pad block. When the landing pad block has only one + /// predecessor, this is a simple branch. When there is more than one + /// predecessor, we need to split the landing pad block after the landingpad + /// instruction and jump to there. + void forwardTaskResume(InvokeInst *TR); + + /// Add incoming-PHI values to the unwind destination block for the given + /// basic block, using the values for the original invoke's source block. + void addIncomingPHIValuesFor(BasicBlock *BB) const { + addIncomingPHIValuesForInto(BB, OuterResumeDest); + } + + void addIncomingPHIValuesForInto(BasicBlock *Src, BasicBlock *Dest) const { + BasicBlock::iterator I = Dest->begin(); + for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { + PHINode *Phi = cast(I); + Phi->addIncoming(UnwindDestPHIValues[i], Src); + } + } +}; +} // end anonymous namespace + +/// Get or create a target for the branch from ResumeInsts. +BasicBlock *LandingPadInliningInfo::getInnerResumeDest() { + if (InnerResumeDest) + return InnerResumeDest; + + // Split the outer resume destionation. + BasicBlock::iterator SplitPoint; + if (isa(SpawnerLPad)) + SplitPoint = ++cast(SpawnerLPad)->getIterator(); + else + SplitPoint = OuterResumeDest->getFirstNonPHI()->getIterator(); + InnerResumeDest = OuterResumeDest->splitBasicBlock( + SplitPoint, OuterResumeDest->getName() + ".body"); + if (DT) + // OuterResumeDest dominates InnerResumeDest, which dominates all other + // nodes dominated by OuterResumeDest. 
+ if (DomTreeNode *OldNode = DT->getNode(OuterResumeDest)) { + std::vector Children(OldNode->begin(), OldNode->end()); + + DomTreeNode *NewNode = DT->addNewBlock(InnerResumeDest, OuterResumeDest); + for (DomTreeNode *I : Children) + DT->changeImmediateDominator(I, NewNode); + } + + // The number of incoming edges we expect to the inner landing pad. + const unsigned PHICapacity = 2; + + // Create corresponding new PHIs for all the PHIs in the outer landing pad. + Instruction *InsertPoint = &InnerResumeDest->front(); + BasicBlock::iterator I = OuterResumeDest->begin(); + for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { + PHINode *OuterPHI = cast(I); + PHINode *InnerPHI = + PHINode::Create(OuterPHI->getType(), PHICapacity, + OuterPHI->getName() + ".lpad-body", InsertPoint); + OuterPHI->replaceAllUsesWith(InnerPHI); + InnerPHI->addIncoming(OuterPHI, OuterResumeDest); + } + + // Create a PHI for the exception values. + InnerEHValuesPHI = PHINode::Create(SpawnerLPad->getType(), PHICapacity, + "eh.lpad-body", InsertPoint); + SpawnerLPad->replaceAllUsesWith(InnerEHValuesPHI); + InnerEHValuesPHI->addIncoming(SpawnerLPad, OuterResumeDest); + + // All done. + return InnerResumeDest; +} + +// Helper method to remove Pred from the PHI nodes of BB, if Pred is present in +// those PHI nodes. Unlike BasicBlock::removePredecessor, this method does not +// error if Pred is not found in a PHI node of BB. +static void maybeRemovePredecessor(BasicBlock *BB, BasicBlock *Pred) { + for (PHINode &PN : BB->phis()) { + int BBIdx = PN.getBasicBlockIndex(Pred); + if (-1 != BBIdx) + PN.removeIncomingValue(BBIdx); + } +} + +/// Forward a task resume - a terminator, such as a detached.rethrow or +/// taskframe.resume, marking the exit from a task for exception handling - to +/// the spawner's landing pad block. When the landing pad block has only one +/// predecessor, this is a simple branch. When there is more than one +/// predecessor, we need to split the landing pad block after the landingpad +/// instruction and jump to there. +void LandingPadInliningInfo::forwardTaskResume(InvokeInst *TR) { + BasicBlock *Dest = getInnerResumeDest(); + BasicBlock *Src = TR->getParent(); + + BranchInst::Create(Dest, Src); + if (DT) + DT->changeImmediateDominator(Dest, + DT->findNearestCommonDominator(Dest, Src)); + + // Update the PHIs in the destination. They were inserted in an order which + // makes this work. 
+ addIncomingPHIValuesForInto(Src, Dest); + + InnerEHValuesPHI->addIncoming(TR->getOperand(1), Src); + + // Update the DT + BasicBlock *NormalDest = nullptr, *UnwindDest = nullptr; + SmallVector Updates; + if (DT) { + if (TR->getNormalDest()->getSinglePredecessor()) + NormalDest = TR->getNormalDest(); + Updates.push_back({DominatorTree::Delete, Src, TR->getNormalDest()}); + + if (TR->getUnwindDest()->getSinglePredecessor()) + UnwindDest = TR->getUnwindDest(); + Updates.push_back({DominatorTree::Delete, Src, TR->getUnwindDest()}); + } + + // Remove the TR + if (!NormalDest) + TR->getNormalDest()->removePredecessor(Src); + if (!UnwindDest) + TR->getUnwindDest()->removePredecessor(Src); + + TR->eraseFromParent(); + + if (DT) { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + DTU.applyUpdates(Updates); + } + + if (NormalDest) { + for (BasicBlock *Succ : successors(NormalDest)) + maybeRemovePredecessor(Succ, NormalDest); + NormalDest->eraseFromParent(); + } + if (UnwindDest) { + for (BasicBlock *Succ : successors(UnwindDest)) + maybeRemovePredecessor(Succ, UnwindDest); + UnwindDest->eraseFromParent(); + } +} + +static void +handleDetachedLandingPads(DetachInst *DI, BasicBlock *EHContinue, + Value *LPadValInEHContinue, + SmallPtrSetImpl &InlinedLPads, + SmallVectorImpl &DetachedRethrows, + DominatorTree *DT = nullptr) { + LandingPadInliningInfo DetUnwind(DI, EHContinue, LPadValInEHContinue, DT); + + // Append the clauses from the outer landing pad instruction into the inlined + // landing pad instructions. + LandingPadInst *OuterLPad = DI->getLandingPadInst(); + for (LandingPadInst *InlinedLPad : InlinedLPads) { + unsigned OuterNum = OuterLPad->getNumClauses(); + InlinedLPad->reserveClauses(OuterNum); + for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx) + InlinedLPad->addClause(OuterLPad->getClause(OuterIdx)); + if (OuterLPad->isCleanup()) + InlinedLPad->setCleanup(true); + } + + // Forward the detached rethrows. + for (Instruction *DR : DetachedRethrows) + DetUnwind.forwardTaskResume(cast(DR)); +} + +void llvm::cloneEHBlocks(Function *F, + SmallVectorImpl &EHBlocksToClone, + SmallPtrSetImpl &EHBlockPreds, + const char *Suffix, + SmallPtrSetImpl *InlinedLPads, + SmallVectorImpl *DetachedRethrows, + DominatorTree *DT, LoopInfo *LI) { + ValueToValueMapTy VMap; + SmallVector NewBlocks; + SmallPtrSet NewBlocksSet; + SmallPtrSet NewInlinedLPads; + SmallPtrSet NewDetachedRethrows; + NewLoopsMap NewLoops; + for (BasicBlock *BB : EHBlocksToClone) { + BasicBlock *New = CloneBasicBlock(BB, VMap, Suffix, F); + VMap[BB] = New; + if (DT) + DT->addNewBlock(New, DT->getRoot()); + + // If the cloned block is inside of a loop, update LoopInfo. + if (LI && LI->getLoopFor(BB)) { + Loop *OldLoop = LI->getLoopFor(BB); + Loop *ParentLoop = OldLoop->getParentLoop(); + if (ParentLoop && !NewLoops.count(ParentLoop)) + NewLoops[ParentLoop] = ParentLoop; + addClonedBlockToLoopInfo(BB, New, LI, NewLoops); + } + + NewBlocks.push_back(New); + NewBlocksSet.insert(New); + } + + // Remap instructions in the cloned blocks based on VMap. + remapInstructionsInBlocks(NewBlocks, VMap); + + SmallPtrSet NewSuccSet; + // For all old successors, remove the predecessors in EHBlockPreds. + for (BasicBlock *EHPred : EHBlockPreds) + for (BasicBlock *OldSucc : successors(EHPred)) + if (VMap.count(OldSucc)) { + OldSucc->removePredecessor(EHPred); + NewSuccSet.insert(cast(VMap[OldSucc])); + } + + // For all new successors, remove the predecessors not in EHBlockPreds. 
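+  // The cloned successors will only be entered from the blocks in
+  // EHBlockPreds (those edges get redirected to the clones below), so their
+  // PHI nodes must not retain incoming entries for any other predecessor.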
+ for (BasicBlock *NewSucc : NewSuccSet) { + for (BasicBlock::iterator I = NewSucc->begin(); isa(I);) { + PHINode *PN = cast(I++); + + // NOTE! This loop walks backwards for a reason! First off, this minimizes + // the cost of removal if we end up removing a large number of values, and + // second off, this ensures that the indices for the incoming values + // aren't invalidated when we remove one. + for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i) + if (!EHBlockPreds.count(PN->getIncomingBlock(i))) + PN->removeIncomingValue(i, false); + } + } + + // Update the dominator tree and edges from EHBlockPreds to cloned EHBlocks. + for (BasicBlock *EHBlock : EHBlocksToClone) { + BasicBlock *IDomBB = nullptr; + if (DT) { + IDomBB = DT->getNode(EHBlock)->getIDom()->getBlock(); + if (VMap.count(IDomBB)) { + DT->changeImmediateDominator(cast(VMap[EHBlock]), + cast(VMap[IDomBB])); + } else { + IDomBB = nullptr; + // Get the idom of EHBlock's predecessors. + for (BasicBlock *Pred : predecessors(EHBlock)) { + if (EHBlockPreds.contains(Pred)) { + if (IDomBB) + IDomBB = DT->findNearestCommonDominator(IDomBB, Pred); + else + IDomBB = Pred; + } + } + assert(IDomBB && "Found no predecessors of EHBlock in EHBlockPreds."); + // Use this computed idom (or its clone) as the idom of the cloned + // EHBlock. + if (VMap.count(IDomBB)) { + DT->changeImmediateDominator(cast(VMap[EHBlock]), + cast(VMap[IDomBB])); + } else { + DT->changeImmediateDominator(cast(VMap[EHBlock]), IDomBB); + } + } + } + } + + // Move the edges from Preds to point to NewEHBlock instead of EHBlock. + for (BasicBlock *EHBlock : EHBlocksToClone) { + BasicBlock *NewEHBlock = cast(VMap[EHBlock]); + DomTreeNodeBase *Node = DT ? DT->getNode(EHBlock) : nullptr; + BasicBlock *EHBlockIDom = Node ? Node->getIDom()->getBlock() : nullptr; + for (BasicBlock *Pred : EHBlockPreds) { + // This is slightly more strict than necessary; the minimum requirement is + // that there be no more than one indirectbr branching to BB. And all + // BlockAddress uses would need to be updated. + assert(!isa(Pred->getTerminator()) && + "Cannot split an edge from an IndirectBrInst"); + Pred->getTerminator()->replaceUsesOfWith(EHBlock, NewEHBlock); + if (DT && EHBlockIDom) + DT->deleteEdge(Pred, EHBlock); + } + } + + // Update all successors of the cloned EH blocks. + for (BasicBlock *BB : EHBlocksToClone) { + for (BasicBlock *Succ : successors(BB)) { + if (NewBlocksSet.count(Succ) || VMap.count(Succ)) + continue; + + // Update the PHI's in the successor of the cloned EH block. + for (PHINode &PN : Succ->phis()) { + Value *Val = PN.getIncomingValueForBlock(BB); + Value *NewVal = VMap.count(Val) ? cast(VMap[Val]) : Val; + PN.addIncoming(NewVal, cast(VMap[BB])); + } + } + } + + if (DT && LI) { + // If any EHBlocks become unreachable, update LoopInfo to remove the + // relevant loops. + for (BasicBlock *EHBlock : EHBlocksToClone) { + if (!DT->isReachableFromEntry(EHBlock)) { + Loop *L = nullptr; + if (LI->isLoopHeader(EHBlock)) { + // Delete the whole loop. + L = LI->getLoopFor(EHBlock); + if (Loop *ParentL = L->getParentLoop()) + ParentL->removeChildLoop(llvm::find(*ParentL, L)); + else + LI->removeLoop(llvm::find(*LI, L)); + } + LI->removeBlock(EHBlock); + // If EHBlock is a loop header, finish destroying the whole loop. + if (L) + LI->destroy(L); + } + } + } + + // Move the new InlinedLPads and DetachedRethrows to the appropriate + // set/vector. 
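+  // Where a landing pad or detached.rethrow was cloned, hand the clone back
+  // to the caller in place of the original; otherwise keep the original.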
+ if (InlinedLPads) { + for (LandingPadInst *LPad : *InlinedLPads) { + if (VMap.count(LPad)) + NewInlinedLPads.insert(cast(VMap[LPad])); + else + NewInlinedLPads.insert(LPad); + } + InlinedLPads->clear(); + for (LandingPadInst *LPad : NewInlinedLPads) + InlinedLPads->insert(LPad); + } + if (DetachedRethrows) { + for (Instruction *DR : *DetachedRethrows) { + if (VMap.count(DR)) + NewDetachedRethrows.insert(cast(VMap[DR])); + else + NewDetachedRethrows.insert(DR); + } + DetachedRethrows->clear(); + for (Instruction *DR : NewDetachedRethrows) + DetachedRethrows->push_back(DR); + } +} + +// Helper function to find landingpads in the specified taskframe. +static void +getTaskFrameLandingPads(Value *TaskFrame, Instruction *TaskFrameResume, + SmallPtrSetImpl &InlinedLPads) { + const BasicBlock *TaskFrameBB = cast(TaskFrame)->getParent(); + SmallVector Worklist; + SmallPtrSet Visited; + // Add the parent of TaskFrameResume to the worklist. + Worklist.push_back(TaskFrameResume->getParent()); + + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Terminate the search once we encounter the BB where the taskframe is + // defined. + if (TaskFrameBB == BB) + continue; + + // If we find a landingpad, add it to the set. + if (BB->isLandingPad()) + InlinedLPads.insert(BB->getLandingPadInst()); + + // Add predecessors to the worklist, but skip any predecessors within nested + // tasks or nested taskframes. + for (BasicBlock *Pred : predecessors(BB)) { + if (isa(Pred->getTerminator()) || + isDetachedRethrow(Pred->getTerminator()) || + isTaskFrameResume(Pred->getTerminator())) + continue; + Worklist.push_back(Pred); + } + } +} + +// Helper method to handle a given taskframe.resume. +static void handleTaskFrameResume(Value *TaskFrame, + Instruction *TaskFrameResume, + DominatorTree *DT = nullptr) { + // Get landingpads to inline. + SmallPtrSet InlinedLPads; + getTaskFrameLandingPads(TaskFrame, TaskFrameResume, InlinedLPads); + + InvokeInst *TFR = cast(TaskFrameResume); + LandingPadInliningInfo TFResumeDest(TFR, DT); + + // Append the clauses from the outer landing pad instruction into the inlined + // landing pad instructions. + LandingPadInst *OuterLPad = TFR->getLandingPadInst(); + for (LandingPadInst *InlinedLPad : InlinedLPads) { + unsigned OuterNum = OuterLPad->getNumClauses(); + InlinedLPad->reserveClauses(OuterNum); + for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx) + InlinedLPad->addClause(OuterLPad->getClause(OuterIdx)); + if (OuterLPad->isCleanup()) + InlinedLPad->setCleanup(true); + } + + // Forward the taskframe.resume. + TFResumeDest.forwardTaskResume(TFR); +} + +void llvm::InlineTaskFrameResumes(Value *TaskFrame, DominatorTree *DT) { + SmallVector TaskFrameResumes; + // Record all taskframe.resume markers that use TaskFrame. + for (User *U : TaskFrame->users()) + if (Instruction *I = dyn_cast(U)) + if (isTaskFrameResume(I)) + TaskFrameResumes.push_back(I); + + // Handle all taskframe.resume markers. 
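+  // The users were copied into a separate vector first because forwarding a
+  // taskframe.resume erases it, which would otherwise invalidate the use
+  // iterator.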
+ for (Instruction *TFR : TaskFrameResumes) + handleTaskFrameResume(TaskFrame, TFR, DT); +} + +static void startSerializingTaskFrame(Value *TaskFrame, + SmallVectorImpl &ToErase, + DominatorTree *DT, + bool PreserveTaskFrame) { + for (User *U : TaskFrame->users()) + if (Instruction *UI = dyn_cast(U)) + if (isTapirIntrinsic(Intrinsic::taskframe_use, UI)) + ToErase.push_back(UI); + + if (!PreserveTaskFrame) + InlineTaskFrameResumes(TaskFrame, DT); +} + +void llvm::SerializeDetach(DetachInst *DI, BasicBlock *ParentEntry, + BasicBlock *EHContinue, Value *LPadValInEHContinue, + SmallVectorImpl &Reattaches, + SmallVectorImpl *EHBlocksToClone, + SmallPtrSetImpl *EHBlockPreds, + SmallPtrSetImpl *InlinedLPads, + SmallVectorImpl *DetachedRethrows, + bool ReplaceWithTaskFrame, DominatorTree *DT, + LoopInfo *LI) { + BasicBlock *Spawner = DI->getParent(); + BasicBlock *TaskEntry = DI->getDetached(); + BasicBlock *Continue = DI->getContinue(); + BasicBlock *Unwind = DI->getUnwindDest(); + Value *SyncRegion = DI->getSyncRegion(); + Module *M = Spawner->getModule(); + + // If the spawned task has a taskframe, serialize the taskframe. + SmallVector ToErase; + Value *TaskFrame = getTaskFrameUsed(TaskEntry); + if (TaskFrame) + startSerializingTaskFrame(TaskFrame, ToErase, DT, ReplaceWithTaskFrame); + + // Clone any EH blocks that need cloning. + if (EHBlocksToClone) { + assert(EHBlockPreds && + "Given EH blocks to clone, but not blocks exiting to them."); + cloneEHBlocks(Spawner->getParent(), *EHBlocksToClone, *EHBlockPreds, ".sd", + InlinedLPads, DetachedRethrows, DT, LI); + } + + // Collect the exit points into a single vector. + SmallVector ExitPoints; + for (Instruction *Exit : Reattaches) + ExitPoints.push_back(Exit); + if (DetachedRethrows) + for (Instruction *Exit : *DetachedRethrows) + ExitPoints.push_back(Exit); + + // Move static alloca instructions in the task entry to the appropriate entry + // block. + bool ContainsDynamicAllocas = + MoveStaticAllocasInBlock(ParentEntry, TaskEntry, ExitPoints); + // If the cloned loop contained dynamic alloca instructions, wrap the inlined + // code with llvm.stacksave/llvm.stackrestore intrinsics. + if (ContainsDynamicAllocas) { + // Get the two intrinsics we care about. + Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave); + Function *StackRestore = + Intrinsic::getDeclaration(M, Intrinsic::stackrestore); + + // Insert the llvm.stacksave. + CallInst *SavedPtr = IRBuilder<>(TaskEntry, TaskEntry->begin()) + .CreateCall(StackSave, {}, "savedstack"); + + // Insert a call to llvm.stackrestore before the reattaches in the original + // Tapir loop. + for (Instruction *Exit : ExitPoints) + IRBuilder<>(Exit).CreateCall(StackRestore, SavedPtr); + } + + // If we're replacing the detach with a taskframe and we don't have a + // taskframe already, create one. + if (ReplaceWithTaskFrame) { + if (!TaskFrame) { + // Create a new task frame. + Function *TFCreate = + Intrinsic::getDeclaration(M, Intrinsic::taskframe_create); + TaskFrame = IRBuilder<>(TaskEntry, TaskEntry->begin()) + .CreateCall(TFCreate, {}, "repltf"); + } + } + + // Handle any detached-rethrows in the task. + bool HasUnwind = DI->hasUnwindDest(); + if (HasUnwind) { + assert(InlinedLPads && "Missing set of landing pads in task."); + assert(DetachedRethrows && "Missing set of detached rethrows in task."); + if (ReplaceWithTaskFrame) { + // If we're replacing the detach with a taskframe, simply replace the + // detached.rethrow intrinsics with taskframe.resume intrinsics. 
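+      // Roughly (operand and label names are illustrative), an exit of the
+      // form
+      //   invoke void @llvm.detached.rethrow(token %syncreg, <lpad value>)
+      //       to label %placeholder unwind label %unwind
+      // becomes
+      //   invoke void @llvm.taskframe.resume(token %taskframe, <lpad value>)
+      //       to label %placeholder unwind label %unwind
+      // reusing the original normal and unwind destinations.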
+ for (Instruction *I : *DetachedRethrows) { + InvokeInst *II = cast(I); + Value *LPad = II->getArgOperand(1); + Function *TFResume = Intrinsic::getDeclaration( + M, Intrinsic::taskframe_resume, {LPad->getType()}); + IRBuilder<>(II).CreateInvoke(TFResume, II->getNormalDest(), + II->getUnwindDest(), {TaskFrame, LPad}); + II->eraseFromParent(); + } + } else { + // Otherwise, "inline" the detached landingpads. + handleDetachedLandingPads(DI, EHContinue, LPadValInEHContinue, + *InlinedLPads, *DetachedRethrows, DT); + } + } + + // Replace reattaches with unconditional branches to the continuation. + BasicBlock *ReattachDom = nullptr; + for (Instruction *I : Reattaches) { + assert(isa(I) && "Recorded reattach is not a reattach"); + assert(cast(I)->getSyncRegion() == SyncRegion && + "Reattach does not match sync region of detach."); + if (DT) { + if (!ReattachDom) + ReattachDom = I->getParent(); + else + ReattachDom = + DT->findNearestCommonDominator(ReattachDom, I->getParent()); + } + + // If we're replacing the detach with a taskframe, insert a taskframe.end + // immediately before the reattach. + if (ReplaceWithTaskFrame) { + Function *TFEnd = Intrinsic::getDeclaration(M, Intrinsic::taskframe_end); + IRBuilder<>(I).CreateCall(TFEnd, {TaskFrame}); + } + ReplaceInstWithInst(I, BranchInst::Create(Continue)); + } + + // Replace the detach with an unconditional branch to the task entry. + Continue->removePredecessor(Spawner); + if (HasUnwind) + Unwind->removePredecessor(Spawner); + ReplaceInstWithInst(DI, BranchInst::Create(TaskEntry)); + + // Erase instructions marked to be erased. + for (Instruction *I : ToErase) + I->eraseFromParent(); + + // Update dominator tree. + if (DT) { + if (ReattachDom && DT->dominates(Spawner, Continue)) + DT->changeImmediateDominator(Continue, ReattachDom); + if (HasUnwind) + DT->deleteEdge(Spawner, Unwind); + } +} + +/// Analyze a task for serialization +void llvm::AnalyzeTaskForSerialization( + Task *T, SmallVectorImpl &Reattaches, + SmallVectorImpl &EHBlocksToClone, + SmallPtrSetImpl &EHBlockPreds, + SmallPtrSetImpl &InlinedLPads, + SmallVectorImpl &DetachedRethrows) { + assert(!T->isRootTask() && "Cannot serialize root task."); + Value *SyncRegion = T->getDetach()->getSyncRegion(); + for (Spindle *S : depth_first>(T->getEntrySpindle())) { + // Look for landing pads in the task (and no subtask) to be merged with a + // spawner landing pad. + for (BasicBlock *BB : S->blocks()) { + // Record any shared-EH blocks that need to be cloned. + if (S->isSharedEH()) { + // Skip basic blocks that are placeholder successors + if (isPlaceholderSuccessor(BB)) + continue; + + EHBlocksToClone.push_back(BB); + if (S->getEntry() == BB) + for (BasicBlock *Pred : predecessors(BB)) + if (T->simplyEncloses(Pred)) + EHBlockPreds.insert(Pred); + } + + if (InvokeInst *II = dyn_cast(BB->getTerminator())) { + if (!isDetachedRethrow(BB->getTerminator(), SyncRegion)) { + assert(!isDetachedRethrow(BB->getTerminator()) && + "Detached rethrow in task does not match sync region."); + // Record this landing pad to merge with DI's landing pad. + InlinedLPads.insert(II->getLandingPadInst()); + } + } else if (DetachInst *SubDI = dyn_cast(BB->getTerminator())) + if (SubDI->hasUnwindDest()) + // Record this landing pad to merge with DI's landing pad. + InlinedLPads.insert(SubDI->getLandingPadInst()); + } + + if (!T->isTaskExiting(S)) + continue; + + // Find the reattach and detached-rethrow exits from this task. 
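+    // Reattaches are the task's normal exits and detached.rethrows are its
+    // exceptional exits; both are recorded so that SerializeDetach can
+    // rewrite them.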
+ for (BasicBlock *BB : S->blocks()) { + if (isa(BB->getTerminator())) { + assert(cast(BB->getTerminator())->getSyncRegion() == + SyncRegion && + "Reattach in task does not match sync region with detach."); + Reattaches.push_back(BB->getTerminator()); + } else if (InvokeInst *II = dyn_cast(BB->getTerminator())) { + if (isDetachedRethrow(II, SyncRegion)) + // Get detached rethrows in the task to forward. + DetachedRethrows.push_back(II); + } + } + } +} + +/// Serialize the detach DI that spawns task T. If provided, the dominator tree +/// DT will be updated to reflect the serialization. +void llvm::SerializeDetach(DetachInst *DI, Task *T, bool ReplaceWithTaskFrame, + DominatorTree *DT) { + assert(DI && "SerializeDetach given nullptr for detach."); + assert(DI == T->getDetach() && "Task and detach arguments do not match."); + SmallVector EHBlocksToClone; + SmallPtrSet EHBlockPreds; + SmallVector Reattaches; + SmallPtrSet InlinedLPads; + SmallVector DetachedRethrows; + + AnalyzeTaskForSerialization(T, Reattaches, EHBlocksToClone, EHBlockPreds, + InlinedLPads, DetachedRethrows); + BasicBlock *EHContinue = nullptr; + Value *LPadVal = nullptr; + if (DI->hasUnwindDest()) { + EHContinue = T->getEHContinuationSpindle()->getEntry(); + LPadVal = T->getLPadValueInEHContinuationSpindle(); + } + SerializeDetach(DI, T->getParentTask()->getEntry(), EHContinue, LPadVal, + Reattaches, &EHBlocksToClone, &EHBlockPreds, &InlinedLPads, + &DetachedRethrows, ReplaceWithTaskFrame, DT); +} + +static bool isCanonicalTaskFrameEnd(const Instruction *TFEnd) { + // Check that the last instruction in the basic block containing TFEnd is + // TFEnd. + const Instruction *Term = &TFEnd->getParent()->back(); + if (!Term || isa(Term) || isa(Term)) + return false; + + const Instruction *Prev = Term->getPrevNode(); + if (!Prev || Prev != TFEnd) + return false; + + return true; +} + +// Check if the basic block terminates a taskframe via a taskframe.end. +static bool endsUnassociatedTaskFrame(const BasicBlock *B) { + const Instruction *Prev = B->getTerminator()->getPrevNode(); + if (!Prev) + return false; + if (isTapirIntrinsic(Intrinsic::taskframe_end, Prev) && + isCanonicalTaskFrameEnd(Prev)) + return true; + return false; +} + +/// Checks if the given taskframe.create instruction is in canonical form. This +/// function mirrors the behavior of needToSplitTaskFrameCreate in +/// Transforms/Utils/TapirUtils. +static bool isCanonicalTaskFrameCreate(const Instruction *TFCreate) { + // If the taskframe.create is not the first instruction, split. + if (TFCreate != &TFCreate->getParent()->front()) + return false; + + // The taskframe.create is at the front of the block. Check that we have a + // single predecessor. + const BasicBlock *Pred = TFCreate->getParent()->getSinglePredecessor(); + if (!Pred) + return false; + + // Check that the single predecessor has a single successor. + if (!Pred->getSingleSuccessor()) + return false; + + // Check whether the single predecessor is terminated with a sync. + if (isa(Pred->getTerminator())) + return false; + + // If the taskframe.create has no users, ignore it. + if (TFCreate->user_empty()) + return false; + + // Check that the uses of the taskframe.create are canonical as well. 
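+  // Specifically, taskframe.use and taskframe.resume users are acceptable as
+  // is, while a taskframe.end user is canonical only if it sits in the
+  // position checked by isCanonicalTaskFrameEnd.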
+ for (const User *U : TFCreate->users()) { + if (const Instruction *I = dyn_cast(U)) { + if (isTapirIntrinsic(Intrinsic::taskframe_use, I) || + isTapirIntrinsic(Intrinsic::taskframe_resume, I)) + return true; + if (isTapirIntrinsic(Intrinsic::taskframe_end, I)) + return isCanonicalTaskFrameEnd(I); + } + } + return true; +} + +static const Value *getCanonicalTaskFrameCreate(const BasicBlock *BB) { + if (const IntrinsicInst *II = dyn_cast(&BB->front())) + if (Intrinsic::taskframe_create == II->getIntrinsicID() && + isCanonicalTaskFrameCreate(II)) + return II; + return nullptr; +} + +/// GetDetachedCtx - Get the entry basic block to the detached context +/// that contains the specified block. +/// +BasicBlock *llvm::GetDetachedCtx(BasicBlock *BB) { + return const_cast( + GetDetachedCtx(const_cast(BB))); +} + +const BasicBlock *llvm::GetDetachedCtx(const BasicBlock *BB) { + // Traverse the CFG backwards until we either reach the entry block of the + // function or we find a detach instruction that detaches the current block. + SmallPtrSet Visited; + SmallVector WorkList; + SmallPtrSet TaskFramesToIgnore; + WorkList.push_back(BB); + while (!WorkList.empty()) { + const BasicBlock *CurrBB = WorkList.pop_back_val(); + if (!Visited.insert(CurrBB).second) + continue; + + // If we find a canonical taskframe.create that we're not ignoring, then + // we've found the context. + if (const Value *TaskFrame = getCanonicalTaskFrameCreate(CurrBB)) + if (!TaskFramesToIgnore.count(TaskFrame)) + return CurrBB; + + for (const BasicBlock *PredBB : predecessors(CurrBB)) { + // Skip predecessors via reattach instructions. The detacher block + // corresponding to this reattach is also a predecessor of the current + // basic block. + if (isa(PredBB->getTerminator())) + continue; + + // Skip predecessors via detach rethrows. + if (isDetachedRethrow(PredBB->getTerminator())) + continue; + + // If we find a taskframe.resume, add its taskframe to the set of + // taskframes to ignore. + if (isTaskFrameResume(PredBB->getTerminator())) { + const InvokeInst *II = cast(PredBB->getTerminator()); + TaskFramesToIgnore.insert(II->getArgOperand(0)); + } else if (endsUnassociatedTaskFrame(PredBB)) { + const CallBase *TFEnd = + cast(PredBB->getTerminator()->getPrevNode()); + TaskFramesToIgnore.insert(TFEnd->getArgOperand(0)); + } + + // If the predecessor is terminated by a detach, check to see if + // that detach spawned the current basic block. + if (isa(PredBB->getTerminator())) { + const DetachInst *DI = cast(PredBB->getTerminator()); + if (DI->getDetached() == CurrBB) + // Return the current block, which is the entry of this detached + // sub-CFG. + return CurrBB; + else if (const Value *SubTaskFrame = + getTaskFrameUsed(DI->getDetached())) + // Ignore this tasks's taskframe, if it has one. + TaskFramesToIgnore.insert(SubTaskFrame); + } + + // Otherwise, add the predecessor block to the work list to search. + WorkList.push_back(PredBB); + } + } + + // Our search didn't find anything, so return the entry of the function + // containing the given block. + return &(BB->getParent()->getEntryBlock()); +} + +// Returns true if the function may not be synced at the point of the given +// basic block, false otherwise. This function does a simple depth-first +// traversal of the CFG, and as such, produces a conservative result. 
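+// A result of true only means that no proof of syncedness was found along
+// some backwards path; it does not guarantee that an unsynced path to the
+// block actually exists.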
+bool llvm::mayBeUnsynced(const BasicBlock *BB) { + SmallPtrSet Visited; + SmallVector WorkList; + SmallPtrSet TaskFramesToIgnore; + WorkList.push_back(BB); + while (!WorkList.empty()) { + const BasicBlock *CurrBB = WorkList.pop_back_val(); + if (!Visited.insert(CurrBB).second) + continue; + + // If we find a canonical taskframe.create that we're not ignoring, then + // we've found the context. + if (const Value *TaskFrame = getCanonicalTaskFrameCreate(CurrBB)) + if (!TaskFramesToIgnore.count(TaskFrame)) + continue; + + for (const BasicBlock *PredBB : predecessors(CurrBB)) { + // If we find a predecessor via reattach instructions, then + // wconservatively return that we may not be synced. + if (isa(PredBB->getTerminator())) + return true; + + // If we find a predecessor via a detached.rethrow, then conservatively + // return that we may not be synced. + if (isDetachedRethrow(PredBB->getTerminator())) + return true; + + // If we find a taskframe.resume, add its taskframe to the set of + // taskframes to ignore. + if (isTaskFrameResume(PredBB->getTerminator())) { + const InvokeInst *II = cast(PredBB->getTerminator()); + TaskFramesToIgnore.insert(II->getArgOperand(0)); + } else if (endsUnassociatedTaskFrame(PredBB)) { + const CallBase *TFEnd = + cast(PredBB->getTerminator()->getPrevNode()); + TaskFramesToIgnore.insert(TFEnd->getArgOperand(0)); + } + + // If the predecessor is terminated by a detach, check to see if + // that detach spawned the current basic block. + if (isa(PredBB->getTerminator())) { + const DetachInst *DI = cast(PredBB->getTerminator()); + if (DI->getDetached() != CurrBB) + // We encountered a continue or unwind destination of a detach. + // Conservatively return that we may not be synced. + return true; + } + + // Otherwise, add the predecessor block to the work list to search. + WorkList.push_back(PredBB); + } + } + return false; +} + +/// isDetachedContinueEdge - Return true if the edge from terminator instruction +/// TI to successor basic block Succ is a detach-continue edge. +bool llvm::isDetachContinueEdge(const Instruction *TI, const BasicBlock *Succ) { + if (isa(TI)) + return true; + if (isDetachedRethrow(TI)) + return Succ == cast(TI)->getUnwindDest(); + if (const DetachInst *DI = dyn_cast(TI)) + return Succ == DI->getContinue() || + (DI->hasUnwindDest() && Succ == DI->getUnwindDest()); + return false; +} + +/// isCriticalContinueEdge - Return true if the specified edge is a critical +/// detach-continue edge. Critical detach-continue edges are critical edges - +/// from a block with multiple successors to a block with multiple predecessors +/// - even after ignoring all reattach edges. +bool llvm::isCriticalContinueEdge(const Instruction *TI, unsigned SuccNum) { + assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!"); + if (TI->getNumSuccessors() == 1) + return false; + + // Edge must come from a detach. + if (!isa(TI)) + return false; + // Edge must go to the continuation. + if (SuccNum != 1) + return false; + + const BasicBlock *Dest = TI->getSuccessor(SuccNum); + const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest); + + // If there is more than one predecessor, this is a critical edge... 
+ assert(I != E && "No preds, but we have an edge to the block?"); + const BasicBlock *DetachPred = TI->getParent(); + for (; I != E; ++I) { + if (DetachPred == *I) + continue; + // Even if a reattach instruction isn't associated with the detach + // instruction TI, we can safely skip it, because it will be associated with + // a different detach instruction that precedes this block. + if (isa((*I)->getTerminator())) + continue; + return true; + } + return false; +} + +/// canDetach - Return true if the given function can perform a detach, false +/// otherwise. +bool llvm::canDetach(const Function *F) { + for (const BasicBlock &BB : *F) + if (isa(BB.getTerminator())) + return true; + return false; +} + +void llvm::GetDetachedCFG(const DetachInst &DI, const DominatorTree &DT, + SmallPtrSetImpl &TaskBlocks, + SmallPtrSetImpl &EHBlocks, + SmallPtrSetImpl &TaskReturns) { + SmallVector Todo; + SmallVector WorkListEH; + + LLVM_DEBUG(dbgs() << "Finding CFG detached by " << DI << "\n"); + + BasicBlock *Detached = DI.getDetached(); + BasicBlock *Continue = DI.getContinue(); + Value *SyncRegion = DI.getSyncRegion(); + BasicBlockEdge DetachEdge(DI.getParent(), Detached); + + Todo.push_back(Detached); + while (!Todo.empty()) { + BasicBlock *BB = Todo.pop_back_val(); + + if (!TaskBlocks.insert(BB).second) + continue; + + LLVM_DEBUG(dbgs() << " Found block " << BB->getName() << "\n"); + + Instruction *Term = BB->getTerminator(); + if (nullptr == Term) + llvm_unreachable("BB with null terminator found."); + + if (ReattachInst *RI = dyn_cast(Term)) { + // Either a reattach instruction terminates the detached CFG or it + // terminates a nested detached CFG. If it terminates a nested detached + // CFG, it can simply be ignored, because the corresponding nested detach + // instruction will be processed later. + if (RI->getDetachContinue() != Continue) + continue; + assert(RI->getSyncRegion() == SyncRegion && + "Reattach terminating detached CFG has nonmatching sync region."); + TaskReturns.insert(BB); + continue; + } else if (DetachInst *NestedDI = dyn_cast(Term)) { + assert(NestedDI != &DI && "Found recursive Detach"); + // Add the successors of the nested detach instruction for searching. + Todo.push_back(NestedDI->getDetached()); + Todo.push_back(NestedDI->getContinue()); + if (NestedDI->hasUnwindDest()) + Todo.push_back(NestedDI->getUnwindDest()); + continue; + } else if (SyncInst *SI = dyn_cast(Term)) { + // A sync instruction should only apply to nested detaches within this + // task. Hence it can be treated like a branch. + assert(SI->getSyncRegion() != SyncRegion && + "Sync in detached task applies to parent parallel context."); + Todo.push_back(SI->getSuccessor(0)); + continue; + } else if (isa(Term) || isa(Term) || + isa(Term)) { + if (isDetachedRethrow(Term, SyncRegion)) { + // A detached rethrow terminates this task and is included in the set of + // exception-handling blocks that might not be unique to this task. + LLVM_DEBUG(dbgs() << " Exit block " << BB->getName() << "\n"); + TaskReturns.insert(BB); + EHBlocks.insert(BB); + } else { + for (BasicBlock *Succ : successors(BB)) { + if (DT.dominates(DetachEdge, Succ)) { + LLVM_DEBUG(dbgs() + << "Adding successor " << Succ->getName() << "\n"); + Todo.push_back(Succ); + } else { + // We assume that this block is an exception-handling block and save + // it for later processing. 
+ LLVM_DEBUG(dbgs() + << " Exit block to search " << Succ->getName() << "\n"); + EHBlocks.insert(Succ); + WorkListEH.push_back(Succ); + } + } + } + continue; + } else if (isa(Term)) { + // We don't bother cloning unreachable exits from the detached CFG at this + // point. We're cloning the entire detached CFG anyway when we outline + // the function. + continue; + } else { + llvm_unreachable( + "Detached task does not absolutely terminate in reattach"); + } + } + + // Find the exception-handling exit blocks. + { + SmallPtrSet Visited; + while (!WorkListEH.empty()) { + BasicBlock *BB = WorkListEH.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Make sure that the control flow through these exception-handling blocks + // cannot re-enter the blocks being outlined. + assert(!TaskBlocks.count(BB) && + "EH blocks for a detached task reenter that task."); + + // Make sure that the control flow through these exception-handling blocks + // doesn't perform an ordinary return or resume. + assert(!isa(BB->getTerminator()) && + "EH block terminated by return."); + assert(!isa(BB->getTerminator()) && + "EH block terminated by resume."); + + // Make sure that the control flow through these exception-handling blocks + // doesn't reattach to the detached CFG's continuation. + LLVM_DEBUG({ + if (ReattachInst *RI = dyn_cast(BB->getTerminator())) + assert(RI->getSuccessor(0) != Continue && + "Exit block reaches a reattach to the continuation."); + }); + + // Stop searching down this path upon finding a detached rethrow. + if (isDetachedRethrow(BB->getTerminator(), SyncRegion)) { + TaskReturns.insert(BB); + continue; + } + + for (BasicBlock *Succ : successors(BB)) { + EHBlocks.insert(Succ); + WorkListEH.push_back(Succ); + } + } + + // Visited now contains exception-handling blocks that we want to clone as + // part of outlining. + for (BasicBlock *EHBlock : Visited) + TaskBlocks.insert(EHBlock); + } + + LLVM_DEBUG({ + dbgs() << "Exit blocks:"; + for (BasicBlock *Exit : EHBlocks) { + if (DT.dominates(DetachEdge, Exit)) + dbgs() << "(dominated)"; + else + dbgs() << "(shared)"; + dbgs() << *Exit; + } + dbgs() << "\n"; + }); +} + +// Helper function to find PHI nodes that depend on the landing pad in the +// unwind destination of this task's detach. +void llvm::getDetachUnwindPHIUses(DetachInst *DI, + SmallPtrSetImpl &UnwindPHIs) { + // Get the landing pad of the unwind destination of the detach. + LandingPadInst *LPad = nullptr; + if (DI && DI->hasUnwindDest()) { + BasicBlock *UnwindDest = DI->getUnwindDest(); + LPad = UnwindDest->getLandingPadInst(); + assert(LPad && "Unwind of detach is not a landing pad."); + } + if (!LPad) + return; + + // Walk the chain of uses of this landing pad to find all PHI nodes that + // depend on it, directly or indirectly. + SmallVector WorkList; + SmallPtrSet Visited; + for (User *U : LPad->users()) + WorkList.push_back(U); + + while (!WorkList.empty()) { + User *Curr = WorkList.pop_back_val(); + if (!Visited.insert(Curr).second) + continue; + + // If we find a PHI-node user, add it to UnwindPHIs + if (PHINode *PN = dyn_cast(Curr)) + UnwindPHIs.insert(PN->getParent()); + + // Queue the successors for processing + for (User *U : Curr->users()) + WorkList.push_back(U); + } +} + +/// Return the taskframe used in the given detached block. +Value *llvm::getTaskFrameUsed(BasicBlock *Detached) { + // Scan the detached block for a taskframe.use intrinsic. If we find one, + // return its argument. 
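+  // For illustration, the entry block of a spawned task that uses a
+  // taskframe typically looks roughly like this (value names are made up):
+  //
+  //   det:                                   ; task entry
+  //     ... @llvm.taskframe.use(token %tf) ...
+  //     ...
+  //
+  // where %tf was produced earlier by a taskframe.create in the spawning
+  // function. The scan below returns that %tf operand if a taskframe.use is
+  // present, and nullptr otherwise.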
+ for (const Instruction &I : *Detached) + if (const IntrinsicInst *II = dyn_cast(&I)) + if (Intrinsic::taskframe_use == II->getIntrinsicID()) + return II->getArgOperand(0); + return nullptr; +} + +// Helper function to check if the given taskframe.create instruction requires +// the parent basic block to be split in order to canonicalize the +// representation of taskframes. +static bool needToSplitTaskFrameCreate(const Instruction *TFCreate) { + // If the taskframe.create is not the first instruction, split. + if (TFCreate != &TFCreate->getParent()->front()) + return true; + + // The taskframe.create is at the front of the block. Check that we have a + // single predecessor. + const BasicBlock *Pred = TFCreate->getParent()->getSinglePredecessor(); + if (!Pred) + return true; + + // Check that the single predecessor has a single successor. + if (!Pred->getSingleSuccessor()) + return true; + + // Check whether the single predecessor is terminated with a sync. + if (isa(Pred->getTerminator())) + return true; + + return false; +} + +// Helper function to check if the given taskframe.end instruction requires the +// parent basic block to be split in order to canonicalize the representation of +// taskframes. +static bool needToSplitTaskFrameEnd(const Instruction *TFEnd) { + const BasicBlock *B = TFEnd->getParent(); + // If the taskframe.end is not the penultimate instruction, split. + if (TFEnd != B->getTerminator()->getPrevNode()) + return true; + + // Check whether the parent block has a single successor. + const BasicBlock *Succ = B->getSingleSuccessor(); + if (!Succ) + return true; + + // Check that the single successor has a single predecessor. + if (!Succ->getSinglePredecessor()) + return true; + + // Check that the single successor is not a taskframe.create entry. + if (isTapirIntrinsic(Intrinsic::taskframe_create, &Succ->front())) + return true; + + // Check whether the parent block is terminated with a sync or a reattach. + if (isa(B->getTerminator()) || + isa(B->getTerminator())) + return true; + + return false; +} + +/// Split blocks in function F containing taskframe.create calls to canonicalize +/// the representation of Tapir taskframes in F. +bool llvm::splitTaskFrameCreateBlocks(Function &F, DominatorTree *DT, + TaskInfo *TI, LoopInfo *LI, + MemorySSAUpdater *MSSAU) { + if (F.empty()) + return false; + + // Scan the function for taskframe.create instructions to split. + SmallVector TFCreateToSplit; + SmallVector DetachesWithTaskFrames; + SmallVector TFEndToSplit; + SmallVector TFResumeToSplit; + SmallVector WorkList; + SmallPtrSet Visited; + WorkList.push_back(&F.getEntryBlock()); + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Scan the instructions in BB for taskframe.create intrinsics. + for (Instruction &I : *BB) { + if (IntrinsicInst *II = dyn_cast(&I)) { + if (Intrinsic::taskframe_create == II->getIntrinsicID()) { + // Record this taskframe.create for splitting. + LLVM_DEBUG(dbgs() << "Pushing TFCreate " << *II << "\n"); + TFCreateToSplit.push_back(II); + + // Look for a detach instructions and taskframe.end intrinsics that + // use this taskframe. + for (User *U : II->users()) { + if (IntrinsicInst *UI = dyn_cast(U)) { + if (Intrinsic::taskframe_use == UI->getIntrinsicID()) { + if (BasicBlock *Pred = UI->getParent()->getSinglePredecessor()) + if (DetachInst *DI = + dyn_cast(Pred->getTerminator())) { + // Record this detach as using a taskframe. 
+ DetachesWithTaskFrames.push_back(DI); + break; + } + } else if (Intrinsic::taskframe_end == UI->getIntrinsicID()) { + // Record this taskframe.end. + TFEndToSplit.push_back(UI); + } + } else if (Instruction *UI = dyn_cast(U)) { + if (isTaskFrameResume(UI, II)) { + // Record this taskframe.resume. + TFResumeToSplit.push_back(UI); + } + } + } + } + } + } + + // Add all successors of BB + for (BasicBlock *Succ : successors(BB)) + WorkList.push_back(Succ); + } + + bool Changed = false; + // Split the basic blocks containing taskframe.create calls so that the + // taskframe.create call starts the basic block. + for (Instruction *I : TFCreateToSplit) + if (needToSplitTaskFrameCreate(I)) { + LLVM_DEBUG(dbgs() << "Splitting at " << *I << "\n"); + StringRef OldName = I->getParent()->getName(); + SplitBlock(I->getParent(), I, DT, LI, MSSAU); + I->getParent()->setName(OldName + ".tf"); + Changed = true; + } + + // Split basic blocks containing taskframe.end calls, so that they end with an + // unconditional branch immediately after the taskframe.end call. + for (Instruction *TFEnd : TFEndToSplit) + if (needToSplitTaskFrameEnd(TFEnd)) { + LLVM_DEBUG(dbgs() << "Splitting block after " << *TFEnd << "\n"); + BasicBlock::iterator Iter = ++TFEnd->getIterator(); + SplitBlock(TFEnd->getParent(), &*Iter, DT, LI, MSSAU); + // Try to attach debug info to the new terminator after the taskframe.end + // call. + Instruction *SplitTerminator = TFEnd->getParent()->getTerminator(); + if (!SplitTerminator->getDebugLoc()) + SplitTerminator->setDebugLoc(TFEnd->getDebugLoc()); + Iter->getParent()->setName(TFEnd->getParent()->getName() + ".tfend"); + Changed = true; + } + + // Split critical continue edges, if we need to. For example, we need to + // split critical continue edges if we're planning to fixup external uses of + // variables defined in a taskframe. + // + // TODO: Predicate this canonicalization on something more intuitive than the + // existence of DT. + for (DetachInst *DI : DetachesWithTaskFrames) { + if (DT && isCriticalContinueEdge(DI, 1)) { + SplitCriticalEdge( + DI, 1, + CriticalEdgeSplittingOptions(DT, nullptr).setSplitDetachContinue()); + Changed = true; + } + } + // Similarly, split unwind edges from taskframe.resume's. + for (Instruction *TFResume : TFResumeToSplit) { + InvokeInst *II = cast(TFResume); + if (DT && isCriticalEdge(II, 1)) { + BasicBlock *Unwind = II->getUnwindDest(); + SplitBlockPredecessors(Unwind, {II->getParent()}, ".tfsplit", DT, LI, + MSSAU); + Changed = true; + } + } + + // Recalculate TaskInfo if necessary. + if (Changed && DT && TI) + TI->recalculate(F, *DT); + + return Changed; +} + +/// taskFrameContains - Returns true if the given basic block \p B is contained +/// within the taskframe \p TF. +bool llvm::taskFrameContains(const Spindle *TF, const BasicBlock *B, + const TaskInfo &TI) { + if (TF->getTaskFrameCreate()) { + if (TF->taskFrameContains(TI.getSpindleFor(B))) + return true; + } else { + // If TF is a task entry, check that that task encloses I's basic block. + return TF->getParentTask()->encloses(B); + } + return false; +} + +/// taskFrameEncloses - Returns true if the given basic block \p B is enclosed +/// within the taskframe \p TF. +bool llvm::taskFrameEncloses(const Spindle *TF, const BasicBlock *B, + const TaskInfo &TI) { + if (taskFrameContains(TF, B, TI)) + return true; + + if (!TF->getTaskFrameCreate()) + return false; + + // TF is a taskframe.create spindle. Recursively check its subtaskframes. 
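+  // Taskframes form a tree: a taskframe.create made while another taskframe
+  // is active becomes its child. Roughly (names are illustrative):
+  //
+  //   %tf.outer = call token @llvm.taskframe.create()
+  //   ...
+  //   %tf.inner = call token @llvm.taskframe.create()
+  //   ...                  ; blocks here are enclosed by both taskframes
+  //
+  // So B is enclosed by TF if some subtaskframe of TF contains it.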
+ for (const Spindle *SubTF : TF->subtaskframes()) + if (taskFrameEncloses(SubTF, B, TI)) + return true; + + return false; +} + +/// fixupTaskFrameExternalUses - Fix any uses of variables defined in +/// taskframes, but outside of tasks themselves. For each such variable, insert +/// a memory allocation in the parent frame, add a store to that memory in the +/// taskframe, and modify external uses to use the value in that memory loaded +/// at the tasks continuation. +void llvm::fixupTaskFrameExternalUses(Spindle *TF, const TaskInfo &TI, + const DominatorTree &DT) { + Value *TaskFrame = TF->getTaskFrameCreate(); + if (!TaskFrame) + // Nothing to do for taskframe spindles that are actually task entries. + return; + Task *T = TF->getTaskFrameUser(); + + LLVM_DEBUG(dbgs() << "fixupTaskFrameExternalUses: spindle@" + << TF->getEntry()->getName() << "\n"); + LLVM_DEBUG({ + if (T) + dbgs() << " used by task@" << T->getEntry()->getName() << "\n"; + }); + + // Get the set of basic blocks in the taskframe spindles. At the same time, + // find the continuation of corresponding taskframe.resume intrinsics. + + SmallPtrSet BlocksToCheck; + BasicBlock *TFResumeContin = nullptr; + for (Spindle *S : TF->taskframe_spindles()) { + // Skip taskframe spindles within the task itself. + if (T && T->contains(S)) + continue; + for (BasicBlock *BB : S->blocks()) { + BlocksToCheck.insert(BB); + if (isTaskFrameResume(BB->getTerminator(), TaskFrame)) { + InvokeInst *TFResume = cast(BB->getTerminator()); + assert(((nullptr == TFResumeContin) || + (TFResumeContin == TFResume->getUnwindDest())) && + "Multiple taskframe.resume destinations found"); + TFResumeContin = TFResume->getUnwindDest(); + } + } + } + + BasicBlock *Continuation = TF->getTaskFrameContinuation(); + + MapVector> ToRewrite; + MapVector> SyncRegionsToLocalize; + // Find instructions in the taskframe that are used outside of the taskframe. + for (BasicBlock *BB : BlocksToCheck) { + for (Instruction &I : *BB) { + // Ignore certain instructions from consideration: the taskframe.create + // intrinsic for this taskframe, the detach instruction that spawns T, and + // the landingpad value in T's EH continuation. + if (T && ((T->getTaskFrameUsed() == &I) || (T->getDetach() == &I) || + (T->getLPadValueInEHContinuationSpindle() == &I))) + continue; + + // Examine all users of this instruction. + for (Use &U : I.uses()) { + // If we find a live use outside of the task, it's an output. + if (Instruction *UI = dyn_cast(U.getUser())) { + if (!taskFrameEncloses(TF, UI->getParent(), TI)) { + LLVM_DEBUG(dbgs() + << " ToRewrite: " << I << " (user " << *UI << ")\n"); + ToRewrite[&I].push_back(&U); + } + } + } + } + // Collect any syncregions used in this taskframe that are defined outside. + if (!T) { + if (DetachInst *DI = dyn_cast(BB->getTerminator())) + if (!taskFrameContains( + TF, cast(DI->getSyncRegion())->getParent(), TI)) { + LLVM_DEBUG(dbgs() + << " Sync region to localize: " << *DI->getSyncRegion() + << "(user " << *DI << ")\n"); + // Only record the detach. We can find associated reattaches and + // detached-rethrows later. 
+ SyncRegionsToLocalize[DI->getSyncRegion()].push_back(DI); + } + + if (SyncInst *SI = dyn_cast(BB->getTerminator())) + if (!taskFrameContains( + TF, cast(SI->getSyncRegion())->getParent(), TI)) { + LLVM_DEBUG(dbgs() + << " Sync region to localize: " << *SI->getSyncRegion() + << "(user " << *SI << ")\n"); + SyncRegionsToLocalize[SI->getSyncRegion()].push_back(SI); + } + } + } + + Module *M = TF->getEntry()->getModule(); + + // Localize any syncregions used in this taskframe. + for (auto &SRUsed : SyncRegionsToLocalize) { + Value *ReplSR = CallInst::Create( + Intrinsic::getDeclaration(M, Intrinsic::syncregion_start), + SRUsed.first->getName(), cast(TaskFrame)->getNextNode()); + for (Instruction *UseToRewrite : SRUsed.second) { + // Replace the syncregion of each sync. + if (SyncInst *SI = dyn_cast(UseToRewrite)) { + SI->setSyncRegion(ReplSR); + // Replace the syncregion of each sync.unwind. + if (CallBase *CB = dyn_cast( + SI->getSuccessor(0)->getFirstNonPHIOrDbgOrLifetime())) + if (isSyncUnwind(CB, SRUsed.first)) + CB->setArgOperand(0, ReplSR); + } else if (DetachInst *DI = dyn_cast(UseToRewrite)) { + // Replace the syncregion of each detach. + DI->setSyncRegion(ReplSR); + Task *SubT = TI.getTaskFor(DI->getDetached()); + // Replace the syncregion of corresponding reattach instructions. + for (BasicBlock *Pred : predecessors(DI->getContinue())) + if (ReattachInst *RI = dyn_cast(Pred->getTerminator())) + if (SubT->encloses(Pred)) + RI->setSyncRegion(ReplSR); + + // Replace the syncregion of corresponding detached.rethrows. + for (User *U : SRUsed.first->users()) + if (InvokeInst *II = dyn_cast(U)) + if (isDetachedRethrow(II) && SubT->encloses(II->getParent())) + II->setArgOperand(0, ReplSR); + } + } + } + + // Rewrite any uses of values defined in the taskframe that are used outside. + for (auto &TFInstr : ToRewrite) { + LLVM_DEBUG(dbgs() << "Fixing taskframe output " << *TFInstr.first << "\n"); + // Create an allocation to store the result of the instruction. + BasicBlock *ParentEntry; + if (Spindle *ParentTF = TF->getTaskFrameParent()) + ParentEntry = ParentTF->getEntry(); + else + ParentEntry = TF->getParentTask()->getEntry(); + IRBuilder<> Builder(&*ParentEntry->getFirstInsertionPt()); + Type *TFInstrTy = TFInstr.first->getType(); + AllocaInst *AI = Builder.CreateAlloca(TFInstrTy); + AI->setName(TFInstr.first->getName()); + + // Store the result of the instruction into that alloca. + if (isa(TFInstr.first)) + Builder.SetInsertPoint( + &*TFInstr.first->getParent()->getFirstInsertionPt()); + else + Builder.SetInsertPoint(&*(++TFInstr.first->getIterator())); + Builder.CreateStore(TFInstr.first, AI); + + // Load the result of the instruction at the continuation. + Builder.SetInsertPoint(&*Continuation->getFirstInsertionPt()); + Builder.CreateCall(Intrinsic::getDeclaration( + M, Intrinsic::taskframe_load_guard, {AI->getType()}), + {AI}); + LoadInst *ContinVal = Builder.CreateLoad(TFInstrTy, AI); + LoadInst *EHContinVal = nullptr; + + // For each external use, replace the use with a load from the alloca. + for (Use *UseToRewrite : TFInstr.second) { + Instruction *User = cast(UseToRewrite->getUser()); + BasicBlock *UserBB = User->getParent(); + if (auto *PN = dyn_cast(User)) + UserBB = PN->getIncomingBlock(*UseToRewrite); + + if (!DT.dominates(Continuation, UserBB)) { + assert(DT.dominates(TFResumeContin, UserBB) && + "Use not dominated by continuation or taskframe.resume"); + // If necessary, load the value at the taskframe.resume continuation. 
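+        // The value may flow out of the taskframe along two different paths,
+        // only one of which is dominated by the load created above. Roughly:
+        //
+        //   continuation (normal exit):      uses the value loaded above
+        //   taskframe.resume unwind block:   needs its own load, created
+        //                                    lazily here as EHContinVal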
+ if (!EHContinVal) { + Builder.SetInsertPoint(&*(TFResumeContin->getFirstInsertionPt())); + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::taskframe_load_guard, + {AI->getType()}), + {AI}); + EHContinVal = Builder.CreateLoad(TFInstrTy, AI); + } + + // Rewrite to use the value loaded at the taskframe.resume continuation. + if (UseToRewrite->get()->hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(*UseToRewrite, EHContinVal); + UseToRewrite->set(EHContinVal); + continue; + } + + // Rewrite to use the value loaded at the continuation. + if (UseToRewrite->get()->hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(*UseToRewrite, ContinVal); + UseToRewrite->set(ContinVal); + } + } +} + +// Helper method to find a taskframe.create intrinsic in the given basic block. +Instruction *llvm::FindTaskFrameCreateInBlock(BasicBlock *BB, + const Value *TFToIgnore) { + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) { + Instruction *I = &*BBI++; + + // Ignore TFToIgnore + if (TFToIgnore == I) + continue; + + // Check if this instruction is a call to taskframe_create. + if (CallInst *CI = dyn_cast(I)) + if (isTapirIntrinsic(Intrinsic::taskframe_create, I)) + return CI; + } + return nullptr; +} + +// Helper method to create an unwind edge for a nested taskframe or spawned +// task. This unwind edge is a new basic block terminated by an appropriate +// terminator, i.e., a taskframe.resume or detached.rethrow intrinsic. +BasicBlock *llvm::CreateSubTaskUnwindEdge(Intrinsic::ID TermFunc, Value *Token, + BasicBlock *UnwindEdge, + BasicBlock *Unreachable, + Instruction *ParentI) { + Function *Caller = UnwindEdge->getParent(); + Module *M = Caller->getParent(); + LandingPadInst *OldLPad = UnwindEdge->getLandingPadInst(); + + // Create a new unwind edge for the detached rethrow. + BasicBlock *NewUnwindEdge = + BasicBlock::Create(Caller->getContext(), UnwindEdge->getName(), Caller); + IRBuilder<> Builder(NewUnwindEdge); + // Get a debug location from ParentI. + if (const DebugLoc &Loc = ParentI->getDebugLoc()) + Builder.SetCurrentDebugLocation(Loc); + + // Add a landingpad to the new unwind edge. + LandingPadInst *LPad = + Builder.CreateLandingPad(OldLPad->getType(), 0, OldLPad->getName()); + LPad->setCleanup(true); + + // Add the terminator-function invocation. + Builder.CreateInvoke( + Intrinsic::getDeclaration(M, TermFunc, {LPad->getType()}), Unreachable, + UnwindEdge, {Token, LPad}); + + return NewUnwindEdge; +} + +static BasicBlock *MaybePromoteCallInBlock(BasicBlock *BB, + BasicBlock *UnwindEdge, + const Value *TaskFrame) { + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) { + Instruction *I = &*BBI++; + + // We only need to check for function calls: inlined invoke + // instructions require no special handling. + CallInst *CI = dyn_cast(I); + + if (!CI || CI->isInlineAsm()) + continue; + + // Stop the search early if we encounter a taskframe.create or a + // taskframe.end. + if (isTapirIntrinsic(Intrinsic::taskframe_create, CI) || + (TaskFrame && + isTapirIntrinsic(Intrinsic::taskframe_end, CI, TaskFrame))) + return nullptr; + + // No need to transform calls that do not throw. + if (CI->doesNotThrow()) + continue; + // We cannot transform calls with musttail tag. + if (CI->isMustTailCall()) + continue; + + // We do not need to (and in fact, cannot) convert possibly throwing calls + // to @llvm.experimental_deoptimize (resp. @llvm.experimental.guard) into + // invokes. 
The caller's "segment" of the deoptimization continuation + // attached to the newly inlined @llvm.experimental_deoptimize + // (resp. @llvm.experimental.guard) call should contain the exception + // handling logic, if any. + if (auto *F = CI->getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize || + F->getIntrinsicID() == Intrinsic::experimental_guard) + continue; + + changeToInvokeAndSplitBasicBlock(CI, UnwindEdge); + return BB; + } + return nullptr; +} + +static Instruction *GetTaskFrameInstructionInBlock(BasicBlock *BB, + const Value *TaskFrame) { + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) { + Instruction *I = &*BBI++; + + // We only need to check for function calls: inlined invoke + // instructions require no special handling. + CallInst *CI = dyn_cast(I); + + if (!CI || CI->isInlineAsm()) + continue; + + // Stop the search early if we encounter a taskframe.create or a + // taskframe.end. + if (isTapirIntrinsic(Intrinsic::taskframe_create, CI) && CI != TaskFrame) + return I; + if (TaskFrame && isTapirIntrinsic(Intrinsic::taskframe_end, CI, TaskFrame)) + return I; + } + return nullptr; +} + +// Recursively handle inlined tasks. +static void +PromoteCallsInTasksHelper(BasicBlock *EntryBlock, BasicBlock *UnwindEdge, + BasicBlock *Unreachable, Value *CurrentTaskFrame, + SmallVectorImpl *ParentWorklist, + SmallPtrSetImpl &Processed) { + SmallVector DetachesToReplace; + SmallVector Worklist; + // TODO: See if we need a global Visited set over all recursive calls, i.e., + // to handle shared exception-handling blocks. + SmallPtrSet Visited; + Worklist.push_back(EntryBlock); + do { + BasicBlock *BB = Worklist.pop_back_val(); + // Skip blocks we've seen before + if (!Visited.insert(BB).second) + continue; + + // Promote any calls in the block to invokes. + while (BasicBlock *NewBB = + MaybePromoteCallInBlock(BB, UnwindEdge, CurrentTaskFrame)) + BB = cast(NewBB->getTerminator())->getNormalDest(); + + Instruction *TFI = GetTaskFrameInstructionInBlock(BB, CurrentTaskFrame); + if (TFI && isTapirIntrinsic(Intrinsic::taskframe_create, TFI)) { + Processed.insert(BB); + Instruction *TFCreate = TFI; + if (TFCreate != CurrentTaskFrame) { + // Split the block at the taskframe.create, if necessary. + BasicBlock *NewBB; + if (TFCreate != &BB->front()) + NewBB = SplitBlock(BB, TFCreate); + else + NewBB = BB; + + // Create an unwind edge for the taskframe. + BasicBlock *TaskFrameUnwindEdge = + CreateSubTaskUnwindEdge(Intrinsic::taskframe_resume, TFCreate, + UnwindEdge, Unreachable, TFCreate); + + // Recursively check all blocks + PromoteCallsInTasksHelper(NewBB, TaskFrameUnwindEdge, Unreachable, + TFCreate, &Worklist, Processed); + + // Remove the unwind edge for the taskframe if it is not needed. + if (pred_empty(TaskFrameUnwindEdge)) + TaskFrameUnwindEdge->eraseFromParent(); + continue; + } + } else if (TFI && isTapirIntrinsic(Intrinsic::taskframe_end, TFI, + CurrentTaskFrame)) { + // If we find a taskframe.end in this block that ends the current + // taskframe, add this block to the parent search. + assert(ParentWorklist && "Unexpected taskframe.end: no parent worklist"); + if (BB->getTerminator()->getPrevNode() != TFI || + !isa(BB->getTerminator())) { + // This taskframe.end does not terminate the basic block. To make sure + // the rest of the block is processed properly, split the block. + BasicBlock *NewBB = SplitBlock(BB, TFI->getNextNode()); + ParentWorklist->push_back(NewBB); + } else { + // Add all successors of BB to the worklist. 
+ for (BasicBlock *Successor : successors(BB)) + ParentWorklist->push_back(Successor); + } + continue; + } + + // Ignore reattach terminators. + if (isa(BB->getTerminator()) || + isDetachedRethrow(BB->getTerminator())) + continue; + + // If we find a taskframe.resume terminator, add its successor to the + // parent search. + if (isTaskFrameResume(BB->getTerminator())) { + assert(isTaskFrameResume(UnwindEdge->getTerminator()) && + "Unexpected taskframe.resume, doesn't correspond to unwind edge"); + InvokeInst *II = cast(BB->getTerminator()); + assert(ParentWorklist && + "Unexpected taskframe.resume: no parent worklist"); + ParentWorklist->push_back(II->getUnwindDest()); + continue; + } + + // Process a detach instruction specially. In particular, process th + // spawned task recursively. + if (DetachInst *DI = dyn_cast(BB->getTerminator())) { + Processed.insert(BB); + if (!DI->hasUnwindDest()) { + // Create an unwind edge for the subtask, which is terminated with a + // detached-rethrow. + BasicBlock *SubTaskUnwindEdge = CreateSubTaskUnwindEdge( + Intrinsic::detached_rethrow, DI->getSyncRegion(), UnwindEdge, + Unreachable, DI); + // Recursively check all blocks in the detached task. + PromoteCallsInTasksHelper(DI->getDetached(), SubTaskUnwindEdge, + Unreachable, CurrentTaskFrame, &Worklist, + Processed); + // If the new unwind edge is not used, remove it. + if (pred_empty(SubTaskUnwindEdge)) + SubTaskUnwindEdge->eraseFromParent(); + else + DetachesToReplace.push_back(DI); + + } else { + // Because this detach has an unwind destination, Any calls in the + // spawned task that may throw should already be invokes. Hence there + // is no need to promote calls in this task. + if (Visited.insert(DI->getUnwindDest()).second) + // If the detach-unwind isn't dead, add it to the worklist. + Worklist.push_back(DI->getUnwindDest()); + } + // Add the continuation to the worklist. + if (isTaskFrameResume(UnwindEdge->getTerminator()) && + (CurrentTaskFrame == getTaskFrameUsed(DI->getDetached()))) { + // This detach-continuation terminates the current taskframe, so push it + // onto the parent worklist. + assert(ParentWorklist && "Unexpected taskframe unwind edge"); + ParentWorklist->push_back(DI->getContinue()); + } else { + // We can process this detach-continuation directly, because it does not + // terminate the current taskframe. + Worklist.push_back(DI->getContinue()); + } + continue; + } + + // In the normal case, add all successors of BB to the worklist. + for (BasicBlock *Successor : successors(BB)) + Worklist.push_back(Successor); + + } while (!Worklist.empty()); + + // Replace detaches that now require unwind destinations. + while (!DetachesToReplace.empty()) { + DetachInst *DI = DetachesToReplace.pop_back_val(); + ReplaceInstWithInst(DI, + DetachInst::Create(DI->getDetached(), DI->getContinue(), + UnwindEdge, DI->getSyncRegion())); + } +} + +static FunctionCallee getDefaultPersonalityFn(Module *M) { + LLVMContext &C = M->getContext(); + Triple T(M->getTargetTriple()); + EHPersonality Pers = getDefaultEHPersonality(T); + return M->getOrInsertFunction(getEHPersonalityName(Pers), + FunctionType::get(Type::getInt32Ty(C), true)); +} + +void llvm::promoteCallsInTasksToInvokes(Function &F, const Twine Name) { + // Collect blocks to process, in order to handle unreachable blocks. 
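+  // Blocks are seeded explicitly here because spawned tasks and taskframes
+  // can sit in code that is unreachable from the entry block. The promotion
+  // itself (in PromoteCallsInTasksHelper) roughly rewrites, for a may-throw
+  // call inside a task,
+  //
+  //   call void @foo()
+  //     -->  invoke void @foo() to label %split unwind label %task.lpad
+  //
+  // where %task.lpad is a landing pad that forwards to a detached.rethrow or
+  // taskframe.resume, so the exception can be rethrown out of the task.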
+ SmallVector ToProcess; + ToProcess.push_back(&F.getEntryBlock()); + for (BasicBlock &BB : F) { + Instruction *TFI = GetTaskFrameInstructionInBlock(&BB, nullptr); + if (TFI && isTapirIntrinsic(Intrinsic::taskframe_create, TFI)) + ToProcess.push_back(&BB); + + if (isa(BB.getTerminator())) + ToProcess.push_back(&BB); + } + + // Create a cleanup block. + LLVMContext &C = F.getContext(); + BasicBlock *CleanupBB = BasicBlock::Create(C, Name, &F); + Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C)); + + LandingPadInst *LPad = + LandingPadInst::Create(ExnTy, 1, Name + ".lpad", CleanupBB); + LPad->setCleanup(true); + ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB); + + // Create the normal return for the task resumes. + BasicBlock *UnreachableBlk = BasicBlock::Create(C, Name + ".unreachable", &F); + + // Recursively handle inlined tasks. + SmallPtrSet Processed; + for (BasicBlock *BB : ToProcess) { + if (!Processed.contains(BB)) + PromoteCallsInTasksHelper(BB, CleanupBB, UnreachableBlk, nullptr, nullptr, + Processed); + } + + // Either finish inserting the cleanup block (and associated data) or remove + // it, depending on whether it is used. + if (!pred_empty(CleanupBB)) { + if (!F.hasPersonalityFn()) { + FunctionCallee PersFn = getDefaultPersonalityFn(F.getParent()); + F.setPersonalityFn(cast(PersFn.getCallee())); + } + // Inherit debug info for the landingpad and resume in CleanupBB, if + // possible. + for (const BasicBlock *Pred : predecessors(CleanupBB)) + if (const DebugLoc &Loc = Pred->getTerminator()->getDebugLoc()) { + LPad->setDebugLoc(Loc); + RI->setDebugLoc(Loc); + break; + } + } else { + CleanupBB->eraseFromParent(); + } + + // Either finish the unreachable block or remove it, depending on whether it + // is used. + if (!pred_empty(UnreachableBlk)) { + IRBuilder<> Builder(UnreachableBlk); + Builder.CreateUnreachable(); + } else { + UnreachableBlk->eraseFromParent(); + } +} + +void llvm::eraseTaskFrame(Value *TaskFrame, DominatorTree *DT) { + InlineTaskFrameResumes(TaskFrame, DT); + SmallVector ToErase; + for (User *U : TaskFrame->users()) { + if (Instruction *UI = dyn_cast(U)) + if (isTapirIntrinsic(Intrinsic::taskframe_use, UI) || + isTapirIntrinsic(Intrinsic::taskframe_end, UI)) + ToErase.push_back(UI); + } + + for (Instruction *I : ToErase) + I->eraseFromParent(); + + cast(TaskFrame)->eraseFromParent(); +} + +/// Find hints specified in the loop metadata and update local values. +void llvm::TapirLoopHints::getHintsFromMetadata() { + MDNode *LoopID = TheLoop->getLoopID(); + if (!LoopID) + return; + + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = nullptr; + SmallVector Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the loop metadata prefix. 
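+    // Tapir loop hints are encoded as operands of the loop's !llvm.loop
+    // metadata, roughly of the form (sketch; the values shown are arbitrary):
+    //
+    //   !0 = distinct !{!0, !1, !2}
+    //   !1 = !{!"tapir.loop.spawn.strategy", i32 1}
+    //   !2 = !{!"tapir.loop.grainsize", i32 8}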
+ StringRef Name = S->getString(); + if (Args.size() == 1) + setHint(Name, Args[0]); + } +} + +/// Checks string hint with one operand and set value if valid. +void llvm::TapirLoopHints::setHint(StringRef Name, Metadata *Arg) { + if (!Name.startswith(Prefix())) + return; + Name = Name.substr(Prefix().size(), StringRef::npos); + + const ConstantInt *C = mdconst::dyn_extract(Arg); + if (!C) + return; + unsigned Val = C->getZExtValue(); + + Hint *Hints[] = {&Strategy, &Grainsize}; + for (auto H : Hints) { + if (Name == H->Name) { + if (H->validate(Val)) + H->Value = Val; + else + LLVM_DEBUG(dbgs() << "Tapir: ignoring invalid hint '" << Name << "'\n"); + break; + } + } +} + +/// Create a new hint from name / value pair. +MDNode *llvm::TapirLoopHints::createHintMetadata(StringRef Name, + unsigned V) const { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = { + MDString::get(Context, Name), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); +} + +/// Matches metadata with hint name. +bool llvm::TapirLoopHints::matchesHintMetadataName( + MDNode *Node, ArrayRef HintTypes) const { + MDString *Name = dyn_cast(Node->getOperand(0)); + if (!Name) + return false; + + for (auto H : HintTypes) + if (Name->getString().endswith(H.Name)) + return true; + return false; +} + +/// Sets current hints into loop metadata, keeping other values intact. +void llvm::TapirLoopHints::writeHintsToMetadata(ArrayRef HintTypes) { + if (HintTypes.size() == 0) + return; + + LLVMContext &Context = TheLoop->getHeader()->getContext(); + SmallVector MDs; + + // Reserve first location for self reference to the LoopID metadata node. + TempMDTuple TempNode = MDNode::getTemporary(Context, std::nullopt); + MDs.push_back(TempNode.get()); + + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. + for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + TheLoop->setLoopID(NewLoopID); +} + +/// Sets current hints into loop metadata, keeping other values intact. +void llvm::TapirLoopHints::writeHintsToClonedMetadata(ArrayRef HintTypes, + ValueToValueMapTy &VMap) { + if (HintTypes.size() == 0) + return; + + LLVMContext &Context = + cast(VMap[TheLoop->getHeader()])->getContext(); + SmallVector MDs; + + // Reserve first location for self reference to the LoopID metadata node. + TempMDTuple TempNode = MDNode::getTemporary(Context, std::nullopt); + MDs.push_back(TempNode.get()); + + // If the loop already has metadata, then ignore the existing operands. + MDNode *OrigLoopID = TheLoop->getLoopID(); + if (!OrigLoopID) + return; + + if (MDNode *LoopID = dyn_cast_or_null(VMap.MD()[OrigLoopID])) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. 
+ for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + // Set the metadata on the terminator of the cloned loop's latch. + BasicBlock *ClonedLatch = cast(VMap[TheLoop->getLoopLatch()]); + assert(ClonedLatch && "Cloned Tapir loop does not have a single latch."); + ClonedLatch->getTerminator()->setMetadata(LLVMContext::MD_loop, NewLoopID); +} + +/// Sets current hints into loop metadata, keeping other values intact. +void llvm::TapirLoopHints::clearHintsMetadata() { + Hint Hints[] = {Hint("spawn.strategy", ST_SEQ, HK_STRATEGY), + Hint("grainsize", 0, HK_GRAINSIZE)}; + LLVMContext &Context = TheLoop->getHeader()->getContext(); + SmallVector MDs; + + // Reserve first location for self reference to the LoopID metadata node. + TempMDTuple TempNode = MDNode::getTemporary(Context, std::nullopt); + MDs.push_back(TempNode.get()); + + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, Hints)) + MDs.push_back(Node); + } + } + + // Replace current metadata node with new one. + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + TheLoop->setLoopID(NewLoopID); +} + +/// Returns true if Tapir-loop hints require loop outlining during lowering. +bool llvm::hintsDemandOutlining(const TapirLoopHints &Hints) { + switch (Hints.getStrategy()) { + case TapirLoopHints::ST_DAC: + return true; + default: + return false; + } +} + +MDNode *llvm::CopyNonTapirLoopMetadata(MDNode *LoopID, MDNode *OrigLoopID) { + SmallVector MDs; + MDs.push_back(nullptr); + + // Gather all existing loop metadata. + if (LoopID) + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) + MDs.push_back(LoopID->getOperand(i)); + + // Inherit metadata from original loop. + for (const MDOperand &Existing : drop_begin(OrigLoopID->operands(), 1)) { + MDNode *Op = cast(Existing.get()); + + // Skip malformatted attribute metadata nodes. + if (Op->getNumOperands() == 0) + return nullptr; + Metadata *NameMD = Op->getOperand(0).get(); + if (!isa(NameMD)) + return nullptr; + StringRef AttrName = cast(NameMD)->getString(); + // Skip tapir.loop metadata + if (!AttrName.startswith("tapir.loop")) + MDs.push_back(Op); + } + + // Build the new loop ID. + MDTuple *NewLoopID = MDNode::get(OrigLoopID->getContext(), MDs); + NewLoopID->replaceOperandWith(0, NewLoopID); + return NewLoopID; +} + +/// Examine a given loop to determine if it is a Tapir loop. Returns the Task +/// that encodes the loop body if so, or nullptr if not. +Task *llvm::getTaskIfTapirLoop(const Loop *L, TaskInfo *TI) { + if (!L || !TI) + return nullptr; + + TapirLoopHints Hints(L); + + LLVM_DEBUG(dbgs() << "Loop hints:" + << " strategy = " + << Hints.printStrategy(Hints.getStrategy()) + << " grainsize = " << Hints.getGrainsize() << "\n"); + + // Check that this loop has the structure of a Tapir loop. + Task *T = getTaskIfTapirLoopStructure(L, TI); + if (!T) + return nullptr; + + // Check that the loop hints require this loop to be outlined. 
+ if (!hintsDemandOutlining(Hints)) + return nullptr; + + return T; +} diff --git a/llvm/lib/Transforms/Utils/TaskCanonicalize.cpp b/llvm/lib/Transforms/Utils/TaskCanonicalize.cpp new file mode 100644 index 000000000000000..c73344d52a905e7 --- /dev/null +++ b/llvm/lib/Transforms/Utils/TaskCanonicalize.cpp @@ -0,0 +1,71 @@ +//===- TaskCanonicalize.cpp - Tapir task simplification pass ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass canonicalizes Tapir tasks, in particular, to split blocks at +// taskframe.create intrinsics. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/TaskCanonicalize.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "task-canonicalize" + +namespace { +struct TaskCanonicalize : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + TaskCanonicalize() : FunctionPass(ID) { + initializeTaskCanonicalizePass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved(); + } +}; +} + +char TaskCanonicalize::ID = 0; +INITIALIZE_PASS_BEGIN(TaskCanonicalize, "task-canonicalize", + "Canonicalize Tapir tasks", false, false) +INITIALIZE_PASS_END(TaskCanonicalize, "task-canonicalize", + "Canonicalize Tapir tasks", false, false) + +namespace llvm { +Pass *createTaskCanonicalizePass() { return new TaskCanonicalize(); } +} // end namespace llvm + +/// runOnFunction - Run through all tasks in the function and canonicalize. +bool TaskCanonicalize::runOnFunction(Function &F) { + if (F.empty()) + return false; + + LLVM_DEBUG(dbgs() << "TaskCanonicalize running on function " << F.getName() + << "\n"); + + return splitTaskFrameCreateBlocks(F); +} + +PreservedAnalyses TaskCanonicalizePass::run(Function &F, + FunctionAnalysisManager &AM) { + if (F.empty()) + return PreservedAnalyses::all(); + + LLVM_DEBUG(dbgs() << "TaskCanonicalize running on function " << F.getName() + << "\n"); + + bool Changed = splitTaskFrameCreateBlocks(F); + if (!Changed) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/Utils/TaskSimplify.cpp b/llvm/lib/Transforms/Utils/TaskSimplify.cpp new file mode 100644 index 000000000000000..795780acad63a96 --- /dev/null +++ b/llvm/lib/Transforms/Utils/TaskSimplify.cpp @@ -0,0 +1,702 @@ +//===- TaskSimplify.cpp - Tapir task simplification pass ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass performs several transformations to simplify Tapir tasks. 
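+// In rough terms (a non-exhaustive sketch of what the code below does):
+//  - syncs that cannot synchronize any logically parallel task are replaced
+//    by unconditional branches, and redundant sync regions are merged;
+//  - taskframes that allocate no stack storage and contain no discriminating
+//    sync are erased, or converted to stacksave/stackrestore when they only
+//    manage stack allocations;
+//  - detaches that cannot reach their continuation, or that immediately
+//    sync (e.g. "x = cilk_spawn f(); cilk_sync;"), are serialized.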
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/TaskSimplify.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TapirTaskInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "task-simplify" + +// Statistics +STATISTIC(NumUniqueSyncRegs, "Number of unique sync regions found."); +STATISTIC(NumDiscriminatingSyncs, "Number of discriminating syncs found."); +STATISTIC(NumTaskFramesErased, "Number of taskframes erased"); +STATISTIC( + NumTaskFramesConverted, + "Number of taskframes converted to stacksave and stackrestore intrinsics"); +STATISTIC(NumSimpl, "Number of blocks simplified"); + +static cl::opt SimplifyTaskFrames( + "simplify-taskframes", cl::init(true), cl::Hidden, + cl::desc("Enable simplification of taskframes.")); + +static cl::opt PostCleanupCFG( + "post-cleanup-cfg", cl::init(true), cl::Hidden, + cl::desc("Cleanup the CFG after task simplification.")); + +static cl::opt PreserveAllSpawns( + "tasksimplify-preserve-all-spawns", cl::init(false), cl::Hidden, + cl::desc("Temporary development switch to ensure TaskSimplify does not " + "eliminate spawns that immediately sync.")); + +static bool syncMatchesReachingTask(const Value *SyncSR, + SmallPtrSetImpl &MPTasks) { + if (MPTasks.empty()) + return false; + for (const Task *MPTask : MPTasks) + if (SyncSR == MPTask->getDetach()->getSyncRegion()) + return true; + return false; +} + +static bool removeRedundantSyncs(MaybeParallelTasks &MPTasks, Task *T) { + // Skip tasks with no subtasks. + if (T->isSerial()) + return false; + + bool Changed = false; + SmallPtrSet RedundantSyncs; + for (Spindle *S : T->spindles()) + // Iterate over outgoing edges of S to find redundant syncs. + for (Spindle::SpindleEdge &Edge : S->out_edges()) + if (SyncInst *Y = dyn_cast(Edge.second->getTerminator())) + if (!syncMatchesReachingTask(Y->getSyncRegion(), MPTasks.TaskList[S])) { + LLVM_DEBUG(dbgs() << "Found redundant sync in spindle " << *S << + "\n"); + RedundantSyncs.insert(Y); + } + + // Replace all unnecesary syncs with unconditional branches. + SmallPtrSet MaybeDeadSyncUnwinds; + for (SyncInst *Y : RedundantSyncs) { + // Check for any sync.unwinds that might now be dead. + Instruction *MaybeSyncUnwind = + Y->getSuccessor(0)->getFirstNonPHIOrDbgOrLifetime(); + if (isSyncUnwind(MaybeSyncUnwind, Y->getSyncRegion())) + MaybeDeadSyncUnwinds.insert(cast(MaybeSyncUnwind)); + + LLVM_DEBUG(dbgs() << "Removing redundant sync " << *Y << "\n"); + ReplaceInstWithInst(Y, BranchInst::Create(Y->getSuccessor(0))); + } + // Remove any dead sync.unwinds. + for (CallBase *CB : MaybeDeadSyncUnwinds) { + LLVM_DEBUG(dbgs() << "Remove dead sync unwind " << *CB << "? 
"); + if (removeDeadSyncUnwind(CB)) + LLVM_DEBUG(dbgs() << "Yes.\n"); + else + LLVM_DEBUG(dbgs() << "No.\n"); + } + + Changed |= !RedundantSyncs.empty(); + + return Changed; +} + +static bool syncIsDiscriminating(const Value *SyncSR, + SmallPtrSetImpl &MPTasks) { + for (const Task *MPTask : MPTasks) + if (SyncSR != MPTask->getDetach()->getSyncRegion()) + return true; + return false; +} + +static bool removeRedundantSyncRegions(MaybeParallelTasks &MPTasks, Task *T) { + if (T->isSerial()) + return false; + + // Create filter for MPTasks of tasks from parent of T. + SmallPtrSet EntryTaskList; + for (const Task *MPTask : MPTasks.TaskList[T->getEntrySpindle()]) + EntryTaskList.insert(MPTask); + + // Find the unique sync regions in this task. + SmallPtrSet UniqueSyncRegs; + Instruction *FirstSyncRegion = nullptr; + for (Task *SubT : T->subtasks()) { + UniqueSyncRegs.insert(SubT->getDetach()->getSyncRegion()); + if (!FirstSyncRegion) + FirstSyncRegion = cast( + SubT->getDetach()->getSyncRegion()); + } + NumUniqueSyncRegs += UniqueSyncRegs.size(); + // Skip this task if there's only one unique sync region. + if (UniqueSyncRegs.size() < 2) + return false; + + bool Changed = false; + SmallPtrSet NonRedundantSyncRegs; + for (Spindle *S : T->spindles()) { + // Only consider spindles that might have tasks in parallel. + if (MPTasks.TaskList[S].empty()) continue; + + // Filter the task list of S to exclude tasks in parallel with the entry. + SmallPtrSet LocalTaskList; + for (const Task *MPTask : MPTasks.TaskList[S]) + if (!EntryTaskList.count(MPTask)) + LocalTaskList.insert(MPTask); + if (LocalTaskList.empty()) continue; + + // Iterate over outgoing edges of S to find discriminating syncs. + for (Spindle::SpindleEdge &Edge : S->out_edges()) + if (const SyncInst *Y = dyn_cast(Edge.second->getTerminator())) + if (syncIsDiscriminating(Y->getSyncRegion(), LocalTaskList)) { + ++NumDiscriminatingSyncs; + LLVM_DEBUG(dbgs() << "Found discriminating sync " << *Y << "\n"); + NonRedundantSyncRegs.insert(Y->getSyncRegion()); + for (const Task *MPTask : LocalTaskList) + NonRedundantSyncRegs.insert(MPTask->getDetach()->getSyncRegion()); + } + } + + // Replace all redundant sync regions with the first sync region. + for (Value *SR : UniqueSyncRegs) { + if (!NonRedundantSyncRegs.count(SR) && SR != FirstSyncRegion) { + LLVM_DEBUG(dbgs() << "Replacing " << *SR << " with " << *FirstSyncRegion + << "\n"); + Changed = true; + SR->replaceAllUsesWith(FirstSyncRegion); + // Ensure that the first sync region is in the entry block of T. + if (FirstSyncRegion->getParent() != T->getEntry()) + FirstSyncRegion->moveAfter(&*T->getEntry()->getFirstInsertionPt()); + } + } + + return Changed; +} + +bool llvm::simplifySyncs(Task *T, MaybeParallelTasks &MPTasks) { + bool Changed = false; + + LLVM_DEBUG(dbgs() << "Simplifying syncs in task @ " + << T->getEntry()->getName() << "\n"); + + // Remove redundant syncs. This optimization might not be necessary here, + // because SimplifyCFG seems to do a good job removing syncs that cannot sync + // anything. + Changed |= removeRedundantSyncs(MPTasks, T); + + // Remove redundant sync regions. 
+ Changed |= removeRedundantSyncRegions(MPTasks, T); + + return Changed; +} + +static bool taskCanThrow(const Task *T) { + for (const Spindle *S : T->spindles()) + for (const BasicBlock *BB : S->blocks()) + if (isa(BB->getTerminator())) + return true; + return false; +} + +static bool taskCanReachContinuation(Task *T) { + if (T->isRootTask()) + return true; + + DetachInst *DI = T->getDetach(); + BasicBlock *Continue = DI->getContinue(); + for (BasicBlock *Pred : predecessors(Continue)) { + if (ReattachInst *RI = dyn_cast(Pred->getTerminator())) + if (T->encloses(RI->getParent())) + return true; + } + + return false; +} + +static bool detachImmediatelySyncs(DetachInst *DI) { + Instruction *I = DI->getContinue()->getFirstNonPHIOrDbgOrLifetime(); + return isa(I); +} + +bool llvm::simplifyTask(Task *T) { + if (T->isRootTask()) + return false; + + LLVM_DEBUG(dbgs() << "Simplifying task @ " << T->getEntry()->getName() + << "\n"); + + bool Changed = false; + DetachInst *DI = T->getDetach(); + + bool NestedSync = taskContainsSync(T); + + // If T's detach has an unwind dest and T cannot throw, remove the unwind + // destination from T's detach. + if (DI->hasUnwindDest()) { + if (!taskCanThrow(T)) { + removeUnwindEdge(DI->getParent()); + // removeUnwindEdge will invalidate the DI pointer. Get the new DI + // pointer. + DI = T->getDetach(); + Changed = true; + } + } + + if (!taskCanReachContinuation(T)) { + // This optimization assumes that if a task cannot reach its continuation + // then we shouldn't bother spawning it. The task might perform code that + // can reach the unwind destination, however. + SerializeDetach(DI, T, NestedSync); + Changed = true; + } else if (!PreserveAllSpawns && detachImmediatelySyncs(DI)) { + SerializeDetach(DI, T, NestedSync); + Changed = true; + } + + return Changed; +} + +static bool canRemoveTaskFrame(const Spindle *TF, MaybeParallelTasks &MPTasks, + bool &TaskFrameContainsAlloca) { + Value *TFCreate = TF->getTaskFrameCreate(); + if (!TFCreate) + // Ignore implicit taskframes created from the start of a task that does not + // explicitly use another taskframe. + return false; + + // We can remove a taskframe if it does not allocate any stack storage of its + // own and it does not contain any distinguishing syncs. + + // We only need to check the spindles in the taskframe itself for these + // properties. We do not need to check the task that uses this taskframe. + const Task *UserT = TF->getTaskFromTaskFrame(); + + if (!UserT && !MPTasks.TaskList[TF].empty() && getTaskFrameResume(TFCreate)) + // Landingpads perform an implicit sync, so if there are logically parallel + // tasks with this unassociated taskframe and it has a resume destination, + // then it has a distinguishing sync. + return false; + + // Create filter for MPTasks of tasks from parent of task UserT, if UserT + // exists. + SmallPtrSet EntryTaskList; + if (UserT) + for (const Task *MPTask : MPTasks.TaskList[UserT->getEntrySpindle()]) + EntryTaskList.insert(MPTask); + + for (const Spindle *S : TF->taskframe_spindles()) { + // Skip spindles in the user task. + if (UserT && UserT->contains(S)) + continue; + + // Skip spindles that are placeholders. + if (isPlaceholderSuccessor(S->getEntry())) + continue; + + // Skip spindles in nested taskframes. + if (S != TF && S->getTaskFrameParent() != TF) + continue; + + // Filter the task list of S to exclude tasks in parallel with the entry. 
+ SmallPtrSet LocalTaskList; + for (const Task *MPTask : MPTasks.TaskList[S]) + if (!EntryTaskList.count(MPTask)) + LocalTaskList.insert(MPTask); + + for (const BasicBlock *BB : S->blocks()) { + // If the taskframe contains an alloca, then we can replace it with + // stacksave and stackrestore intrinsics if there is no associated task. + // Otherwise, we cannot remove the taskframe. + for (const Instruction &I : *BB) { + if (isa(I)) { + TaskFrameContainsAlloca = true; + if (UserT) + return false; + } + } + + // We cannot remove taskframes that contain discriminating syncs. Doing + // so would cause these syncs to sync tasks spawned in the parent + // taskframe. + if (const SyncInst *SI = dyn_cast(BB->getTerminator())) + if (syncIsDiscriminating(SI->getSyncRegion(), LocalTaskList)) + return false; + } + } + + return true; +} + +static bool skipForHoisting(const Instruction *I, + SmallPtrSetImpl &NotHoisted) { + if (I->isTerminator() || isTapirIntrinsic(Intrinsic::taskframe_create, I) || + isTapirIntrinsic(Intrinsic::syncregion_start, I) || + isa(I)) + return true; + + if (const CallInst *CI = dyn_cast(I)) + if (!(CI->doesNotAccessMemory() || CI->onlyAccessesArgMemory())) + return true; + + for (const Value *V : I->operand_values()) + if (const Instruction *I = dyn_cast(V)) + if (NotHoisted.count(I)) + return true; + + return false; +} + +static bool hoistOutOfTaskFrame(Instruction *TFCreate) { + bool Changed = false; + + BasicBlock *Entry = TFCreate->getParent(); + // We'll move instructions immediately before the taskframe.create + // instruction. + BasicBlock::iterator InsertPoint = Entry->begin(); + + // Scan the instructions in the entry block and find instructions to hoist + // before the taskframe.create. + SmallPtrSet NotHoisted; + for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ) { + Instruction *Start = &*I++; + if (skipForHoisting(Start, NotHoisted)) { + NotHoisted.insert(Start); + continue; + } + + while (!skipForHoisting(&*I, NotHoisted)) + ++I; + + // Move the instructions + Entry->splice(InsertPoint, &*Entry, Start->getIterator(), I); + + Changed = true; + } + + return Changed; +} + +bool llvm::simplifyTaskFrames(TaskInfo &TI, DominatorTree &DT) { + // We compute maybe-parallel tasks here, to ensure the analysis is properly + // discarded if the CFG changes. + MaybeParallelTasks MPTasks; + TI.evaluateParallelState(MPTasks); + + bool Changed = false; + + // Get the set of taskframes we can erase. + SmallVector TaskFramesToErase; + SmallVector TaskFramesToConvert; + SmallVector TaskFramesToOptimize; + for (Spindle *TFRoot : TI.getRootTask()->taskframe_roots()) { + for (Spindle *TF : post_order>(TFRoot)) { + bool TaskFrameContainsAlloca = false; + if (canRemoveTaskFrame(TF, MPTasks, TaskFrameContainsAlloca)) { + if (TaskFrameContainsAlloca) + TaskFramesToConvert.push_back( + cast(TF->getTaskFrameCreate())); + else + TaskFramesToErase.push_back( + cast(TF->getTaskFrameCreate())); + } else if (Value *TFCreate = TF->getTaskFrameCreate()) + TaskFramesToOptimize.push_back(cast(TFCreate)); + } + } + + // First handle hoisting instructions out of a taskframe entry block, since + // this transformation does not change the CFG. + for (Instruction *TFCreate : TaskFramesToOptimize) { + LLVM_DEBUG(dbgs() << "Hoisting instructions out of taskframe " << *TFCreate + << "\n"); + Changed |= hoistOutOfTaskFrame(TFCreate); + } + + // Now delete any taskframes we don't need. 
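+  // For a taskframe that is only needed to bound stack allocations, the
+  // conversion below roughly amounts to (names are illustrative):
+  //
+  //   %tf = call token @llvm.taskframe.create()
+  //       -->  %sp = call ptr @llvm.stacksave()
+  //   ... @llvm.taskframe.end(token %tf)
+  //       -->  call void @llvm.stackrestore(ptr %sp)
+  //
+  // after which eraseTaskFrame removes the taskframe intrinsics themselves.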
+ for (Instruction *TFCreate : TaskFramesToConvert) { + LLVM_DEBUG(dbgs() << "Converting taskframe " << *TFCreate << "\n"); + Module *M = TFCreate->getModule(); + Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave); + Function *StackRestore = + Intrinsic::getDeclaration(M, Intrinsic::stackrestore); + + // Save the stack at the point of the taskframe.create. + CallInst *SavedPtr = + IRBuilder<>(TFCreate).CreateCall(StackSave, {}, "savedstack.ts"); + + for (User *U : TFCreate->users()) { + if (Instruction *UI = dyn_cast(U)) { + // Restore the stack at each end of the taskframe. + if (isTapirIntrinsic(Intrinsic::taskframe_end, UI) || + isTapirIntrinsic(Intrinsic::taskframe_resume, UI)) + IRBuilder<>(UI).CreateCall(StackRestore, SavedPtr); + } + } + // Remove the taskframe. + eraseTaskFrame(TFCreate, &DT); + ++NumTaskFramesConverted; + Changed = true; + } + for (Instruction *TFCreate : TaskFramesToErase) { + LLVM_DEBUG(dbgs() << "Removing taskframe " << *TFCreate << "\n"); + eraseTaskFrame(TFCreate, &DT); + ++NumTaskFramesErased; + Changed = true; + } + + return Changed; +} + + +/// Call SimplifyCFG on all the blocks in the function, +/// iterating until no more changes are made. +static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, + DomTreeUpdater *DTU, + const SimplifyCFGOptions &Options) { + bool Changed = false; + bool LocalChange = true; + + SmallVector, 32> Edges; + FindFunctionBackedges(F, Edges); + SmallPtrSet UniqueLoopHeaders; + for (unsigned i = 0, e = Edges.size(); i != e; ++i) + UniqueLoopHeaders.insert(const_cast(Edges[i].second)); + + SmallVector LoopHeaders(UniqueLoopHeaders.begin(), + UniqueLoopHeaders.end()); + + while (LocalChange) { + LocalChange = false; + + // Loop over all of the basic blocks and remove them if they are unneeded. + for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { + BasicBlock &BB = *BBIt++; + if (DTU) { + assert( + !DTU->isBBPendingDeletion(&BB) && + "Should not end up trying to simplify blocks marked for removal."); + // Make sure that the advanced iterator does not point at the blocks + // that are marked for removal, skip over all such blocks. + while (BBIt != F.end() && DTU->isBBPendingDeletion(&*BBIt)) + ++BBIt; + } + if (simplifyCFG(&BB, TTI, DTU, Options, LoopHeaders)) { + LocalChange = true; + ++NumSimpl; + } + } + Changed |= LocalChange; + } + return Changed; +} + +static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, + DominatorTree *DT, + const SimplifyCFGOptions &Options) { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + + bool EverChanged = removeUnreachableBlocks(F, DT ? &DTU : nullptr); + EverChanged |= iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options); + + // If neither pass changed anything, we're done. + if (!EverChanged) return false; + + // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens, + // removeUnreachableBlocks is needed to nuke them, which means we should + // iterate between the two optimizations. We structure the code like this to + // avoid rerunning iterativelySimplifyCFG if the second pass of + // removeUnreachableBlocks doesn't do anything. + if (!removeUnreachableBlocks(F, DT ? &DTU : nullptr)) + return true; + + do { + EverChanged = iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options); + EverChanged |= removeUnreachableBlocks(F, DT ? 
+  } while (EverChanged);
+
+  return true;
+}
+
+namespace {
+struct TaskSimplify : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  TaskSimplify() : FunctionPass(ID) {
+    initializeTaskSimplifyPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<TaskInfoWrapperPass>();
+    AU.addPreserved();
+  }
+};
+}
+
+char TaskSimplify::ID = 0;
+INITIALIZE_PASS_BEGIN(TaskSimplify, "task-simplify",
+                      "Simplify Tapir tasks", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TaskInfoWrapperPass)
+INITIALIZE_PASS_END(TaskSimplify, "task-simplify",
+                    "Simplify Tapir tasks", false, false)
+
+namespace llvm {
+Pass *createTaskSimplifyPass() { return new TaskSimplify(); }
+} // end namespace llvm
+
+/// runOnFunction - Run through all tasks in the function and simplify them in
+/// post order.
+///
+bool TaskSimplify::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  TaskInfo &TI = getAnalysis<TaskInfoWrapperPass>().getTaskInfo();
+  bool SplitBlocks = splitTaskFrameCreateBlocks(F, &DT, &TI);
+  TI.findTaskFrameTree();
+  if (TI.isSerial() && !TI.foundChildTaskFrames())
+    return false;
+
+  SimplifyCFGOptions Options;
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+  bool Changed = false;
+  LLVM_DEBUG(dbgs() << "TaskSimplify running on function " << F.getName()
+                    << "\n");
+
+  if (SimplifyTaskFrames) {
+    // Simplify taskframes. If anything changed, update the analysis.
+    Changed |= simplifyTaskFrames(TI, DT);
+    if (Changed) {
+      TI.recalculate(F, DT);
+      if (TI.isSerial()) {
+        if (PostCleanupCFG && SplitBlocks)
+          simplifyFunctionCFG(F, TTI, &DT, Options);
+        return Changed;
+      }
+    }
+  }
+
+  // Evaluate the tasks that might be in parallel with each spindle, and
+  // determine number of discriminating syncs: syncs that sync a subset of the
+  // detached tasks, based on sync regions.
+  MaybeParallelTasks MPTasks;
+  TI.evaluateParallelState(MPTasks);
+
+  // Simplify syncs in each task in the function.
+  for (Task *T : post_order(TI.getRootTask()))
+    Changed |= simplifySyncs(T, MPTasks);
+
+  // Simplify each task in the function.
+  for (Task *T : post_order(TI.getRootTask()))
+    Changed |= simplifyTask(T);
+
+  if (PostCleanupCFG && (Changed | SplitBlocks))
+    Changed |= simplifyFunctionCFG(F, TTI, nullptr, Options);
+
+  return Changed;
+}
+
+PreservedAnalyses TaskSimplifyPass::run(Function &F,
+                                        FunctionAnalysisManager &AM) {
+  if (F.empty())
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  TaskInfo &TI = AM.getResult<TaskAnalysis>(F);
+  LoopInfo *LI = AM.getCachedResult<LoopAnalysis>(F);
+  auto *MSSAAnalysis = AM.getCachedResult<MemorySSAAnalysis>(F);
+  std::unique_ptr<MemorySSAUpdater> MSSAU;
+  if (MSSAAnalysis) {
+    auto *MSSA = &MSSAAnalysis->getMSSA();
+    MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+  }
+
+  bool SplitBlocks = splitTaskFrameCreateBlocks(F, &DT, &TI, LI, MSSAU.get());
+  TI.findTaskFrameTree();
+  // Return early if there are no Tapir tasks or taskframes to simplify.
+  if (TI.isSerial() && !TI.foundChildTaskFrames()) {
+    // If we didn't even split taskframe.create blocks, all analyses are
+    // preserved.
+    if (!SplitBlocks)
+      return PreservedAnalyses::all();
+
+    // Identify passes preserved by splitTaskFrameCreateBlocks.
+    PA.preserve();
+    PA.preserve();
+    PA.preserve();
+    if (LI)
+      PA.preserve<LoopAnalysis>();
+    if (MSSAAnalysis)
+      PA.preserve<MemorySSAAnalysis>();
+    return PA;
+  }
+
+  SimplifyCFGOptions Options;
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  Options.AC = &AM.getResult<AssumptionAnalysis>(F);
+
+  bool Changed = false;
+  LLVM_DEBUG(dbgs() << "TaskSimplify running on function " << F.getName()
+                    << "\n");
+
+  if (SimplifyTaskFrames) {
+    // Simplify taskframes. If anything changed, update the analysis.
+    Changed |= simplifyTaskFrames(TI, DT);
+    if (Changed) {
+      TI.recalculate(F, DT);
+      if (TI.isSerial()) {
+        if (PostCleanupCFG && SplitBlocks)
+          simplifyFunctionCFG(F, TTI, &DT, Options);
+        PA.preserve();
+        return PA;
+      }
+    }
+  }
+
+  // Evaluate the tasks that might be in parallel with each spindle, and
+  // determine number of discriminating syncs: syncs that sync a subset of the
+  // detached tasks, based on sync regions.
+  MaybeParallelTasks MPTasks;
+  TI.evaluateParallelState(MPTasks);
+
+  // Simplify syncs in each task in the function.
+  for (Task *T : post_order(TI.getRootTask()))
+    Changed |= simplifySyncs(T, MPTasks);
+
+  // Simplify each task in the function.
+  for (Task *T : post_order(TI.getRootTask()))
+    Changed |= simplifyTask(T);
+
+  if (PostCleanupCFG && (Changed | SplitBlocks))
+    Changed |= simplifyFunctionCFG(F, TTI, nullptr, Options);
+
+  if (!Changed) {
+    PA.preserve();
+    PA.preserve();
+    return PA;
+  }
+  PA = PreservedAnalyses::none();
+  return PA;
+}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 68363abdb817a4a..717c27dbc332320 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -91,6 +91,7 @@
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TapirTaskInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -2591,6 +2592,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopScalarBody = OrigLoop->getHeader();
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
+  assert(!isa<SyncInst>(LoopVectorPreHeader->getTerminator()) &&
+         "Loop preheader terminated by sync.");
   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
          "multiple exit loop without required epilogue?");
diff --git a/llvm/projects/CMakeLists.txt b/llvm/projects/CMakeLists.txt
index 08f2fa522420b0e..9fc530b950bec19 100644
--- a/llvm/projects/CMakeLists.txt
+++ b/llvm/projects/CMakeLists.txt
@@ -11,7 +11,9 @@ foreach(entry ${entries})
       (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libunwind) AND
       (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/test-suite) AND
       (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/openmp) AND
-      (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/cross-project-tests))
+      (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/cross-project-tests) AND
+      (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/cheetah) AND
+      (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/cilktools))
     get_filename_component(entry_name "${entry}" NAME)
     add_llvm_external_project(${entry_name})
   endif()
@@ -28,6 +30,8 @@ if(${LLVM_BUILD_RUNTIME})
   if(NOT MSVC OR
LLVM_FORCE_BUILD_RUNTIME) # Add the projects in reverse order of their dependencies so that the # dependent projects can see the target names of their dependencies. + add_llvm_external_project(cilktools) + add_llvm_external_project(cheetah) add_llvm_external_project(libunwind) add_llvm_external_project(pstl) add_llvm_external_project(libc) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 42b1b86ebaadf0e..f02395df20143c9 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -264,6 +264,8 @@ function(runtime_default_target) -DCMAKE_C_COMPILER_WORKS=ON -DCMAKE_CXX_COMPILER_WORKS=ON -DCMAKE_ASM_COMPILER_WORKS=ON + # TODO: Check if this argument is still needed. + -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET} ${COMMON_CMAKE_ARGS} ${RUNTIMES_CMAKE_ARGS} ${ARG_CMAKE_ARGS} diff --git a/llvm/test/Analysis/TapirRaceDetect/bitcast-function.ll b/llvm/test/Analysis/TapirRaceDetect/bitcast-function.ll new file mode 100644 index 000000000000000..c1b5c5aaa7c1a35 --- /dev/null +++ b/llvm/test/Analysis/TapirRaceDetect/bitcast-function.ll @@ -0,0 +1,27 @@ +; Check static race detection with calls to bitcast functions in +; blocks terminated by unreachable. +; +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: sanitize_cilk +define dso_local void @setup() local_unnamed_addr #0 { +entry: + tail call void (i32, ...) bitcast (void (...)* @bpnn_initialize to void (i32, ...)*)(i32 7) #2 + unreachable +} + +; CHECK: tail call void (i32, ...) @bpnn_initialize(i32 7) +; CHECK: Opaque +; CHECK: Opaque racer + +declare dso_local void @bpnn_initialize(...) 
local_unnamed_addr #1 + +attributes #0 = { sanitize_cilk } +attributes #1 = { "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 12.0.0 (git@github.com:OpenCilk/opencilk-project.git 33ec1ef302b9173b44ffda58e6ad9447b803598a)"} diff --git a/llvm/test/Analysis/TapirRaceDetect/tapir-rd-objects.ll b/llvm/test/Analysis/TapirRaceDetect/tapir-rd-objects.ll new file mode 100644 index 000000000000000..72b0bc758f3e02c --- /dev/null +++ b/llvm/test/Analysis/TapirRaceDetect/tapir-rd-objects.ll @@ -0,0 +1,6079 @@ +; RUN: opt < %s -passes='print' -aa-pipeline=default -evaluate-aa-metadata -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.cilk_c_monoid = type { void (i8*, i8*, i8*)*, void (i8*, i8*)*, void (i8*, i8*)*, i8* (i8*, i64)*, void (i8*, i8*)* } +%class.Graph = type { i32, i32, i32*, i32* } +%class.Bag_reducer = type { %"class.cilk::reducer" } +%"class.cilk::reducer" = type { %"class.cilk::internal::reducer_content.base", i8 } +%"class.cilk::internal::reducer_content.base" = type <{ %"class.cilk::internal::reducer_base", [127 x i8] }> +%"class.cilk::internal::reducer_base" = type { %struct.__cilkrts_hyperobject_base, %"class.cilk::internal::storage_for_object", i8* } +%struct.__cilkrts_hyperobject_base = type { %struct.cilk_c_monoid, i64, i64, i64 } +%"class.cilk::internal::storage_for_object" = type { %"class.cilk::internal::aligned_storage" } +%"class.cilk::internal::aligned_storage" = type { [1 x i8] } +%class.Bag = type <{ i32, [4 x i8], %class.Pennant**, i32*, i32, [4 x i8] }> +%class.Pennant = type { i32*, %class.Pennant*, %class.Pennant* } + +$_ZNK5Graph13pbfs_walk_BagEP3BagIiEP11Bag_reducerIiEjPj = comdat any + +; Function Attrs: inlinehint uwtable +define linkonce_odr dso_local void @_ZNK5Graph13pbfs_walk_BagEP3BagIiEP11Bag_reducerIiEjPj(%class.Graph* %this, %class.Bag* %b, %class.Bag_reducer* %next, i32 %newdist, i32* %distances) local_unnamed_addr #10 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg !2219 { +entry: + %syncreg = tail call token @llvm.syncregion.start() + call void @llvm.dbg.value(metadata %class.Graph* %this, metadata !2221, metadata !DIExpression()), !dbg !2240 + call void @llvm.dbg.value(metadata %class.Bag* %b, metadata !2222, metadata !DIExpression()), !dbg !2241 + call void @llvm.dbg.value(metadata %class.Bag_reducer* %next, metadata !2223, metadata !DIExpression()), !dbg !2242 + call void @llvm.dbg.value(metadata i32 %newdist, metadata !2224, metadata !DIExpression()), !dbg !2243 + call void @llvm.dbg.value(metadata i32* %distances, metadata !2225, metadata !DIExpression()), !dbg !2244 + call void @llvm.dbg.value(metadata %class.Bag* %b, metadata !2245, metadata !DIExpression()), !dbg !2248 + %fill.i = getelementptr inbounds %class.Bag, %class.Bag* %b, i64 0, i32 0, !dbg !2250 + %0 = load i32, i32* %fill.i, align 8, !dbg !2250, !tbaa !1992 + %cmp = icmp eq i32 %0, 0, !dbg !2251 + br i1 %cmp, label %if.else, label %if.end.i, !dbg !2252 + +if.end.i: ; preds = %entry + call void @llvm.dbg.value(metadata %class.Pennant* null, metadata !2226, metadata !DIExpression()), !dbg !2253 + call void @llvm.dbg.value(metadata %class.Pennant** undef, metadata !2226, metadata !DIExpression(DW_OP_deref)), !dbg !2253 + call void @llvm.dbg.value(metadata %class.Bag* %b, metadata !2254, metadata !DIExpression()), !dbg !2258 + call void @llvm.dbg.value(metadata %class.Pennant** undef, 
metadata !2257, metadata !DIExpression()), !dbg !2260 + %dec.i = add i32 %0, -1, !dbg !2261 + store i32 %dec.i, i32* %fill.i, align 8, !dbg !2261, !tbaa !1992 + %bag.i = getelementptr inbounds %class.Bag, %class.Bag* %b, i64 0, i32 2, !dbg !2262 + %1 = load %class.Pennant**, %class.Pennant*** %bag.i, align 8, !dbg !2262, !tbaa !2188 + %idxprom.i = zext i32 %dec.i to i64, !dbg !2263 + %arrayidx.i = getelementptr inbounds %class.Pennant*, %class.Pennant** %1, i64 %idxprom.i, !dbg !2263 + %2 = load %class.Pennant*, %class.Pennant** %arrayidx.i, align 8, !dbg !2263, !tbaa !1901 + store %class.Pennant* null, %class.Pennant** %arrayidx.i, align 8, !dbg !2264, !tbaa !1901 + %cmp921.i = icmp eq i32 %dec.i, 0, !dbg !2265 + br i1 %cmp921.i, label %_ZN3BagIiE5splitEPP7PennantIiE.exit, label %for.body.lr.ph.i, !dbg !2268 + +for.body.lr.ph.i: ; preds = %if.end.i + %3 = load %class.Pennant**, %class.Pennant*** %bag.i, align 8, !tbaa !2188 + br label %for.body.i, !dbg !2268 + +for.body.i: ; preds = %for.inc.i, %for.body.lr.ph.i + %indvars.iv.i = phi i64 [ %idxprom.i, %for.body.lr.ph.i ], [ %indvars.iv.next.i, %for.inc.i ] + %4 = trunc i64 %indvars.iv.i to i32, !dbg !2269 + %sub.i = add i32 %4, -1, !dbg !2269 + %idxprom12.i = zext i32 %sub.i to i64, !dbg !2272 + %arrayidx13.i = getelementptr inbounds %class.Pennant*, %class.Pennant** %3, i64 %idxprom12.i, !dbg !2272 + %5 = load %class.Pennant*, %class.Pennant** %arrayidx13.i, align 8, !dbg !2272, !tbaa !1901 + %cmp14.i = icmp eq %class.Pennant* %5, null, !dbg !2273 + br i1 %cmp14.i, label %for.inc.i, label %_ZN3BagIiE5splitEPP7PennantIiE.exit, !dbg !2274 + +for.inc.i: ; preds = %for.body.i + store i32 %sub.i, i32* %fill.i, align 8, !dbg !2275, !tbaa !1992 + %cmp9.i = icmp eq i32 %sub.i, 0, !dbg !2265 + %indvars.iv.next.i = add nsw i64 %indvars.iv.i, -1, !dbg !2269 + br i1 %cmp9.i, label %_ZN3BagIiE5splitEPP7PennantIiE.exit, label %for.body.i, !dbg !2268, !llvm.loop !2276 + +_ZN3BagIiE5splitEPP7PennantIiE.exit: ; preds = %for.body.i, %for.inc.i, %if.end.i + detach within %syncreg, label %det.achd, label %det.cont unwind label %lpad3, !dbg !2279 + +det.achd: ; preds = %_ZN3BagIiE5splitEPP7PennantIiE.exit + invoke void @_ZNK5Graph13pbfs_walk_BagEP3BagIiEP11Bag_reducerIiEjPj(%class.Graph* %this, %class.Bag* nonnull %b, %class.Bag_reducer* %next, i32 %newdist, i32* %distances) + to label %invoke.cont unwind label %lpad, !dbg !2279 + +invoke.cont: ; preds = %det.achd + reattach within %syncreg, label %det.cont, !dbg !2279 + +det.cont: ; preds = %_ZN3BagIiE5splitEPP7PennantIiE.exit, %invoke.cont + call void @llvm.dbg.value(metadata %class.Pennant* %2, metadata !2226, metadata !DIExpression()), !dbg !2253 + invoke void @_ZNK5Graph17pbfs_walk_PennantEP7PennantIiEP11Bag_reducerIiEjPj(%class.Graph* %this, %class.Pennant* %2, %class.Bag_reducer* %next, i32 %newdist, i32* %distances) + to label %invoke.cont7 unwind label %lpad3, !dbg !2280 + +invoke.cont7: ; preds = %det.cont + sync within %syncreg, label %if.end, !dbg !2281 + +lpad: ; preds = %det.achd + %6 = landingpad { i8*, i32 } + catch i8* null, !dbg !2282 + invoke void @llvm.detached.rethrow.sl_p0i8i32s(token %syncreg, { i8*, i32 } %6) + to label %det.rethrow.unreachable unwind label %lpad3, !dbg !2279 + +det.rethrow.unreachable: ; preds = %lpad + unreachable, !dbg !2279 + +lpad3: ; preds = %det.cont, %_ZN3BagIiE5splitEPP7PennantIiE.exit, %lpad + %7 = landingpad { i8*, i32 } + cleanup, !dbg !2282 + %8 = extractvalue { i8*, i32 } %7, 0, !dbg !2282 + %9 = extractvalue { i8*, i32 } %7, 1, !dbg !2282 + sync within 
%syncreg, label %eh.resume, !dbg !2283 + +if.else: ; preds = %entry + call void @llvm.dbg.value(metadata %class.Bag* %b, metadata !2284, metadata !DIExpression()), !dbg !2287 + %size.i = getelementptr inbounds %class.Bag, %class.Bag* %b, i64 0, i32 4, !dbg !2289 + %10 = load i32, i32* %size.i, align 8, !dbg !2289, !tbaa !1998 + call void @llvm.dbg.value(metadata i32 %10, metadata !2229, metadata !DIExpression()), !dbg !2290 + call void @llvm.dbg.value(metadata %class.Bag* %b, metadata !2291, metadata !DIExpression()), !dbg !2294 + %filling.i182 = getelementptr inbounds %class.Bag, %class.Bag* %b, i64 0, i32 3, !dbg !2296 + %11 = load i32*, i32** %filling.i182, align 8, !dbg !2296, !tbaa !2155 + call void @llvm.dbg.value(metadata i32* %11, metadata !2231, metadata !DIExpression()), !dbg !2297 + %rem = srem i32 %10, 256, !dbg !2298 + call void @llvm.dbg.value(metadata i32 %rem, metadata !2232, metadata !DIExpression()), !dbg !2299 + %nodes = getelementptr inbounds %class.Graph, %class.Graph* %this, i64 0, i32 2, !dbg !2300 + %12 = load i32*, i32** %nodes, align 8, !dbg !2300, !tbaa !1682 + %edges = getelementptr inbounds %class.Graph, %class.Graph* %this, i64 0, i32 3, !dbg !2301 + %13 = load i32*, i32** %edges, align 8, !dbg !2301, !tbaa !1687 + detach within %syncreg, label %det.achd13, label %det.cont18 unwind label %lpad19, !dbg !2302 + +det.achd13: ; preds = %if.else + %syncreg.i = tail call token @llvm.syncregion.start() + %idx.ext = sext i32 %10 to i64, !dbg !2303 + %add.ptr = getelementptr inbounds i32, i32* %11, i64 %idx.ext, !dbg !2303 + %narrow = sub nsw i32 0, %rem, !dbg !2304 + %idx.neg = sext i32 %narrow to i64, !dbg !2304 + %add.ptr12 = getelementptr inbounds i32, i32* %add.ptr, i64 %idx.neg, !dbg !2304 + call void @llvm.dbg.value(metadata i32* %add.ptr12, metadata !2305, metadata !DIExpression()), !dbg !2341 + call void @llvm.dbg.value(metadata i32 %rem, metadata !2311, metadata !DIExpression()), !dbg !2343 + call void @llvm.dbg.value(metadata %class.Bag_reducer* %next, metadata !2312, metadata !DIExpression()), !dbg !2344 + call void @llvm.dbg.value(metadata i32 %newdist, metadata !2313, metadata !DIExpression()), !dbg !2345 + call void @llvm.dbg.value(metadata i32* %distances, metadata !2314, metadata !DIExpression()), !dbg !2346 + call void @llvm.dbg.value(metadata i32* %12, metadata !2315, metadata !DIExpression()), !dbg !2347 + call void @llvm.dbg.value(metadata i32* %13, metadata !2316, metadata !DIExpression()), !dbg !2348 + call void @llvm.dbg.value(metadata %class.Bag_reducer* %next, metadata !2028, metadata !DIExpression()), !dbg !2349 + call void @llvm.dbg.value(metadata %class.Bag_reducer* %next, metadata !2009, metadata !DIExpression()), !dbg !2351 + call void @llvm.dbg.value(metadata %class.Bag_reducer* %next, metadata !1978, metadata !DIExpression()), !dbg !2353 + %m_base.i.i.i.i = getelementptr inbounds %class.Bag_reducer, %class.Bag_reducer* %next, i64 0, i32 0, i32 0, i32 0, i32 0, !dbg !2355 + %call.i.i.i.i108 = invoke i8* @__cilkrts_hyper_lookup(%struct.__cilkrts_hyperobject_base* %m_base.i.i.i.i) + to label %call.i.i.i.i.noexc unwind label %lpad14.loopexit.split-lp, !dbg !2356 + +call.i.i.i.i.noexc: ; preds = %det.achd13 + call void @llvm.dbg.value(metadata i8* %call.i.i.i.i108, metadata !2317, metadata !DIExpression()), !dbg !2357 + call void @llvm.dbg.value(metadata i32 0, metadata !2318, metadata !DIExpression()), !dbg !2358 + %cmp105.i = icmp sgt i32 %rem, 0, !dbg !2359 + br i1 %cmp105.i, label %for.body.preheader.i, label %invoke.cont17, !dbg !2360 + 
+for.body.preheader.i: ; preds = %call.i.i.i.i.noexc + %14 = sext i32 %rem to i64, !dbg !2361 + %filling.i183 = getelementptr inbounds i8, i8* %call.i.i.i.i108, i64 16 + %15 = bitcast i8* %filling.i183 to i32** + %size.i184 = getelementptr inbounds i8, i8* %call.i.i.i.i108, i64 24 + %16 = bitcast i8* %size.i184 to i32* + %17 = bitcast i8* %filling.i183 to i8** + %fill.i191 = bitcast i8* %call.i.i.i.i108 to i32* + %bag.i192 = getelementptr inbounds i8, i8* %call.i.i.i.i108, i64 8 + %18 = bitcast i8* %bag.i192 to %class.Pennant*** + br label %for.body.i102, !dbg !2361 + +for.body.i102: ; preds = %if.end44.i, %for.body.preheader.i + %indvars.iv111.i = phi i64 [ 0, %for.body.preheader.i ], [ %indvars.iv.next112.i, %if.end44.i ] + call void @llvm.dbg.value(metadata i64 %indvars.iv111.i, metadata !2318, metadata !DIExpression()), !dbg !2358 + %arrayidx.i100 = getelementptr inbounds i32, i32* %add.ptr12, i64 %indvars.iv111.i, !dbg !2361 + %19 = load i32, i32* %arrayidx.i100, align 4, !dbg !2361, !tbaa !1701 + %idxprom1.i = sext i32 %19 to i64, !dbg !2362 + %arrayidx2.i = getelementptr inbounds i32, i32* %12, i64 %idxprom1.i, !dbg !2362 + %20 = load i32, i32* %arrayidx2.i, align 4, !dbg !2362, !tbaa !1701 + call void @llvm.dbg.value(metadata i32 %20, metadata !2320, metadata !DIExpression()), !dbg !2363 + %add.i = add nsw i32 %19, 1, !dbg !2364 + %idxprom5.i = sext i32 %add.i to i64, !dbg !2365 + %arrayidx6.i = getelementptr inbounds i32, i32* %12, i64 %idxprom5.i, !dbg !2365 + %21 = load i32, i32* %arrayidx6.i, align 4, !dbg !2365, !tbaa !1701 + call void @llvm.dbg.value(metadata i32 %21, metadata !2323, metadata !DIExpression()), !dbg !2366 + %sub.i101 = sub i32 %21, %20, !dbg !2367 + %cmp7.i = icmp slt i32 %sub.i101, 128, !dbg !2368 + %cmp9103.i = icmp sgt i32 %21, %20, !dbg !2369 + br i1 %cmp7.i, label %for.cond8.preheader.i, label %if.else.i, !dbg !2370 + +for.cond8.preheader.i: ; preds = %for.body.i102 + call void @llvm.dbg.value(metadata i32 %20, metadata !2324, metadata !DIExpression()), !dbg !2371 + br i1 %cmp9103.i, label %for.body11.preheader.i, label %if.end44.i, !dbg !2372 + +for.body11.preheader.i: ; preds = %for.cond8.preheader.i + %22 = sext i32 %20 to i64, !dbg !2373 + br label %for.body11.i, !dbg !2373 + +for.body11.i: ; preds = %if.end.i104, %for.body11.preheader.i + %indvars.iv108.i = phi i64 [ %22, %for.body11.preheader.i ], [ %indvars.iv.next109.i, %if.end.i104 ] + call void @llvm.dbg.value(metadata i64 %indvars.iv108.i, metadata !2324, metadata !DIExpression()), !dbg !2371 + %arrayidx13.i103 = getelementptr inbounds i32, i32* %13, i64 %indvars.iv108.i, !dbg !2373 + %23 = load i32, i32* %arrayidx13.i103, align 4, !dbg !2373, !tbaa !1701 + call void @llvm.dbg.value(metadata i32 %23, metadata !2328, metadata !DIExpression()), !dbg !2374 + %idxprom14.i = sext i32 %23 to i64, !dbg !2375 + %arrayidx15.i = getelementptr inbounds i32, i32* %distances, i64 %idxprom14.i, !dbg !2375 + %24 = load i32, i32* %arrayidx15.i, align 4, !dbg !2375, !tbaa !1701 + %cmp16.i = icmp ugt i32 %24, %newdist, !dbg !2377 + br i1 %cmp16.i, label %if.then17.i, label %if.end.i104, !dbg !2378 + +if.then17.i: ; preds = %for.body11.i + call void @llvm.dbg.value(metadata i8* %call.i.i.i.i108, metadata !2143, metadata !DIExpression()), !dbg !2379 + call void @llvm.dbg.value(metadata i32 %23, metadata !2140, metadata !DIExpression()), !dbg !2382 + %25 = load i32*, i32** %15, align 8, !dbg !2383, !tbaa !2155 + %26 = load i32, i32* %16, align 8, !dbg !2384, !tbaa !1998 + %inc.i185 = add i32 %26, 1, !dbg !2384 + 
store i32 %inc.i185, i32* %16, align 8, !dbg !2384, !tbaa !1998 + %idxprom.i186 = zext i32 %26 to i64, !dbg !2385 + %arrayidx.i187 = getelementptr inbounds i32, i32* %25, i64 %idxprom.i186, !dbg !2385 + store i32 %23, i32* %arrayidx.i187, align 4, !dbg !2386, !tbaa !1701 + %27 = load i32, i32* %16, align 8, !dbg !2387, !tbaa !1998 + %cmp.i188 = icmp ult i32 %27, 2048, !dbg !2388 + br i1 %cmp.i188, label %.noexc, label %if.end.i193, !dbg !2389 + +if.end.i193: ; preds = %if.then17.i + %call.i221 = invoke i8* @_Znwm(i64 24) #19 + to label %call.i.noexc220 unwind label %lpad14.loopexit, !dbg !2390 + +call.i.noexc220: ; preds = %if.end.i193 + call void @llvm.dbg.value(metadata i32* %25, metadata !2169, metadata !DIExpression()) #2, !dbg !2391 + %els.i.i189 = bitcast i8* %call.i221 to i32**, !dbg !2393 + store i32* %25, i32** %els.i.i189, align 8, !dbg !2394, !tbaa !2176 + %l.i.i190 = getelementptr inbounds i8, i8* %call.i221, i64 8, !dbg !2395 + tail call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %l.i.i190, i8 0, i64 16, i1 false) #2, !dbg !2396 + %call4.i223 = invoke i8* @_Znam(i64 8192) #19 + to label %call4.i.noexc222 unwind label %lpad14.loopexit, !dbg !2397 + +call4.i.noexc222: ; preds = %call.i.noexc220 + %28 = bitcast i8* %call.i221 to %class.Pennant*, !dbg !2390 + call void @llvm.dbg.value(metadata %class.Pennant* %28, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata %class.Pennant* %28, metadata !2166, metadata !DIExpression()) #2, !dbg !2399 + store i8* %call4.i223, i8** %17, align 8, !dbg !2400, !tbaa !2155 + store i32 0, i32* %16, align 8, !dbg !2401, !tbaa !1998 + call void @llvm.dbg.value(metadata i32 0, metadata !2145, metadata !DIExpression()), !dbg !2402 + %29 = load i32, i32* %fill.i191, align 8, !tbaa !1992 + %30 = zext i32 %29 to i64, !dbg !2403 + br label %do.body.i197, !dbg !2403 + +do.body.i197: ; preds = %if.then11.i208.1, %call4.i.noexc222 + %indvars.iv254 = phi i64 [ 0, %call4.i.noexc222 ], [ %indvars.iv.next255.1, %if.then11.i208.1 ], !dbg !2379 + %c.0.i195 = phi %class.Pennant* [ %28, %call4.i.noexc222 ], [ %107, %if.then11.i208.1 ], !dbg !2379 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata i64 %indvars.iv254, metadata !2145, metadata !DIExpression()), !dbg !2402 + %cmp7.i196 = icmp ult i64 %indvars.iv254, %30, !dbg !2404 + %31 = load %class.Pennant**, %class.Pennant*** %18, align 8, !dbg !2405, !tbaa !2188 + br i1 %cmp7.i196, label %land.lhs.true.i203, label %if.else.i217, !dbg !2406 + +land.lhs.true.i203: ; preds = %do.body.i197 + %arrayidx9.i201 = getelementptr inbounds %class.Pennant*, %class.Pennant** %31, i64 %indvars.iv254, !dbg !2407 + %32 = load %class.Pennant*, %class.Pennant** %arrayidx9.i201, align 8, !dbg !2407, !tbaa !1901 + %cmp10.i202 = icmp eq %class.Pennant* %32, null, !dbg !2408 + br i1 %cmp10.i202, label %38, label %if.then11.i208, !dbg !2409 + +if.then11.i208: ; preds = %land.lhs.true.i203 + call void @llvm.dbg.value(metadata %class.Pennant* %32, metadata !2193, metadata !DIExpression()), !dbg !2410 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195, metadata !2196, metadata !DIExpression()), !dbg !2412 + %l.i48.i204 = getelementptr inbounds %class.Pennant, %class.Pennant* %32, i64 0, i32 1, !dbg !2413 + %33 = bitcast %class.Pennant** %l.i48.i204 to i64*, !dbg !2413 + %34 = load i64, i64* %33, align 8, !dbg !2413, !tbaa !2202 + %r.i.i205 = getelementptr inbounds %class.Pennant, 
%class.Pennant* %c.0.i195, i64 0, i32 2, !dbg !2414 + %35 = bitcast %class.Pennant** %r.i.i205 to i64*, !dbg !2415 + store i64 %34, i64* %35, align 8, !dbg !2415, !tbaa !2205 + store %class.Pennant* %c.0.i195, %class.Pennant** %l.i48.i204, align 8, !dbg !2416, !tbaa !2202 + call void @llvm.dbg.value(metadata %class.Pennant* %32, metadata !2144, metadata !DIExpression()), !dbg !2398 + store %class.Pennant* null, %class.Pennant** %arrayidx9.i201, align 8, !dbg !2417, !tbaa !1901 + %indvars.iv.next255 = or i64 %indvars.iv254, 1, !dbg !2418 + call void @llvm.dbg.value(metadata i32 undef, metadata !2145, metadata !DIExpression(DW_OP_plus_uconst, 1, DW_OP_stack_value)), !dbg !2402 + call void @llvm.dbg.value(metadata %class.Pennant* %32, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata i64 %indvars.iv.next255, metadata !2145, metadata !DIExpression()), !dbg !2402 + %cmp7.i196.1 = icmp ult i64 %indvars.iv.next255, %30, !dbg !2404 + %36 = load %class.Pennant**, %class.Pennant*** %18, align 8, !dbg !2405, !tbaa !2188 + br i1 %cmp7.i196.1, label %land.lhs.true.i203.1, label %if.else.i217, !dbg !2406 + +if.else.i217: ; preds = %if.then11.i208, %do.body.i197 + %indvars.iv254.lcssa = phi i64 [ %indvars.iv254, %do.body.i197 ], [ %indvars.iv.next255, %if.then11.i208 ], !dbg !2379 + %c.0.i195.lcssa = phi %class.Pennant* [ %c.0.i195, %do.body.i197 ], [ %32, %if.then11.i208 ], !dbg !2379 + %.lcssa337 = phi %class.Pennant** [ %31, %do.body.i197 ], [ %36, %if.then11.i208 ], !dbg !2405 + call void @llvm.dbg.value(metadata i64 %indvars.iv254.lcssa, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata i64 %indvars.iv254.lcssa, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata i64 %indvars.iv254.lcssa, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + %37 = trunc i64 %indvars.iv254.lcssa to i32, !dbg !2406 + call void @llvm.dbg.value(metadata i64 %indvars.iv254.lcssa, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata i32 %37, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata i32 %37, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata i32 %37, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata i32 %37, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + %idxprom20.pre-phi.i210 = and i64 %indvars.iv254.lcssa, 4294967295, !dbg !2419 + call void 
@llvm.dbg.value(metadata i32 %37, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata i32 %37, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata i32 %37, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata i32 %37, metadata !2145, metadata !DIExpression()), !dbg !2402 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + call void @llvm.dbg.value(metadata %class.Pennant* %c.0.i195.lcssa, metadata !2144, metadata !DIExpression()), !dbg !2398 + %arrayidx21.i211 = getelementptr inbounds %class.Pennant*, %class.Pennant** %.lcssa337, i64 %idxprom20.pre-phi.i210, !dbg !2419 + store %class.Pennant* %c.0.i195.lcssa, %class.Pennant** %arrayidx21.i211, align 8, !dbg !2420, !tbaa !1901 + call void @llvm.dbg.value(metadata i32 %29, metadata !2146, metadata !DIExpression()), !dbg !2421 + %add.i212 = add nuw i32 %37, 1, !dbg !2421 + call void @llvm.dbg.value(metadata i32 %add.i212, metadata !2151, metadata !DIExpression()), !dbg !2421 + %xor.i213 = xor i32 %add.i212, %29, !dbg !2421 + br label %39, !dbg !2421 + +;