diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.h b/llvm/include/llvm/CodeGenData/CodeGenData.h
new file mode 100644
index 00000000000000..118fb9841d27e8
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.h
@@ -0,0 +1,202 @@
+//===- CodeGenData.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for codegen data that has stable summary which
+// can be used to optimize the code in the subsequent codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATA_H
+#define LLVM_CODEGENDATA_CODEGENDATA_H
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/TargetParser/Triple.h"
+#include <mutex>
+
+namespace llvm {
+
+enum CGDataSectKind {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) Kind,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+std::string getCodeGenDataSectionName(CGDataSectKind CGSK,
+                                      Triple::ObjectFormatType OF,
+                                      bool AddSegmentInfo = true);
+
+enum class CGDataKind {
+  Unknown = 0x0,
+  // A function outlining info.
+  FunctionOutlinedHashTree = 0x1,
+  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/FunctionOutlinedHashTree)
+};
+
+const std::error_category &cgdata_category();
+
+enum class cgdata_error {
+  success = 0,
+  eof,
+  bad_magic,
+  bad_header,
+  empty_cgdata,
+  malformed,
+  unsupported_version,
+};
+
+inline std::error_code make_error_code(cgdata_error E) {
+  return std::error_code(static_cast<int>(E), cgdata_category());
+}
+
+class CGDataError : public ErrorInfo<CGDataError> {
+public:
+  CGDataError(cgdata_error Err, const Twine &ErrStr = Twine())
+      : Err(Err), Msg(ErrStr.str()) {
+    assert(Err != cgdata_error::success && "Not an error");
+  }
+
+  std::string message() const override;
+
+  void log(raw_ostream &OS) const override { OS << message(); }
+
+  std::error_code convertToErrorCode() const override {
+    return make_error_code(Err);
+  }
+
+  cgdata_error get() const { return Err; }
+  const std::string &getMessage() const { return Msg; }
+
+  /// Consume an Error and return the raw enum value contained within it, and
+  /// the optional error message. The Error must either be a success value, or
+  /// contain a single CGDataError.
+  static std::pair<cgdata_error, std::string> take(Error E) {
+    auto Err = cgdata_error::success;
+    std::string Msg = "";
+    handleAllErrors(std::move(E), [&Err, &Msg](const CGDataError &IPE) {
+      assert(Err == cgdata_error::success && "Multiple errors encountered");
+      Err = IPE.get();
+      Msg = IPE.getMessage();
+    });
+    return {Err, Msg};
+  }
+
+  static char ID;
+
+private:
+  cgdata_error Err;
+  std::string Msg;
+};
+
+enum CGDataMode {
+  None,
+  Read,
+  Write,
+};
+
+class CodeGenData {
+  /// Global outlined hash tree that has outlined hash sequences across modules.
+  std::unique_ptr<OutlinedHashTree> PublishedHashTree;
+
+  /// This flag is set when -fcgdata-generate is passed.
+  /// Or, it can be mutated with -ftwo-codegen-rounds during two codegen runs.
+  bool EmitCGData = false;
+
+  /// This is a singleton instance which is thread-safe. Unlike profile data
+  /// which is largely function-based, codegen data describes the whole module.
+  /// Therefore, this can be initialized once, and can be used across modules
+  /// instead of constructing the same one for each codegen backend.
+  static std::unique_ptr<CodeGenData> Instance;
+  static std::once_flag OnceFlag;
+
+  CodeGenData() = default;
+
+public:
+  ~CodeGenData() = default;
+
+  static CodeGenData &getInstance();
+
+  /// Returns true if we have a valid outlined hash tree.
+  bool hasOutlinedHashTree() {
+    return PublishedHashTree && !PublishedHashTree->empty();
+  }
+
+  /// Returns the outlined hash tree. This can be globally used in a read-only
+  /// manner.
+  const OutlinedHashTree *getOutlinedHashTree() {
+    return PublishedHashTree.get();
+  }
+
+  /// Returns true if we should write codegen data.
+  bool emitCGData() { return EmitCGData; }
+
+  /// Publish the (globally) merged or read outlined hash tree.
+  void publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
+    PublishedHashTree = std::move(HashTree);
+    // Ensure we disable emitCGData as we do not want to read and write both.
+    EmitCGData = false;
+  }
+};
+
+namespace cgdata {
+
+inline bool hasOutlinedHashTree() {
+  return CodeGenData::getInstance().hasOutlinedHashTree();
+}
+
+inline const OutlinedHashTree *getOutlinedHashTree() {
+  return CodeGenData::getInstance().getOutlinedHashTree();
+}
+
+inline bool emitCGData() { return CodeGenData::getInstance().emitCGData(); }
+
+inline void
+publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
+  CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
+}
+
+void warn(Error E, StringRef Whence = "");
+void warn(Twine Message, std::string Whence = "", std::string Hint = "");
+
+} // end namespace cgdata
+
+namespace IndexedCGData {
+
+const uint64_t Magic = 0x81617461646763ff; // "\xffcgdata\x81"
+
+enum CGDataVersion {
+  // Version 1 is the first version. This version supports the outlined
+  // hash tree.
+  Version1 = 1,
+  CurrentVersion = CG_DATA_INDEX_VERSION
+};
+const uint64_t Version = CGDataVersion::CurrentVersion;
+
+struct Header {
+  uint64_t Magic;
+  uint32_t Version;
+  uint32_t DataKind;
+  uint64_t OutlinedHashTreeOffset;
+
+  // New fields should only be added at the end to ensure that the size
+  // computation is correct. The methods below need to be updated to ensure that
+  // the new field is read correctly.
+
+  // Reads a header struct from the buffer.
+  static Expected<Header> readFromBuffer(const unsigned char *Curr);
+};
+
+} // end namespace IndexedCGData
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_CODEGENDATA_H
diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.inc b/llvm/include/llvm/CodeGenData/CodeGenData.inc
new file mode 100644
index 00000000000000..5f6df5c0bf1065
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.inc
@@ -0,0 +1,46 @@
+/*===-- CodeGenData.inc ----------------------------------------*- C++ -*-=== *\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+/*
+ * This is the main file that defines all the data structure, signature,
+ * constant literals that are shared across compiler, host tools (reader/writer)
+ * to support codegen data.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifdef CG_DATA_SECT_ENTRY
+#define CG_DATA_DEFINED
+CG_DATA_SECT_ENTRY(CG_outline, CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON),
+                   CG_DATA_OUTLINE_COFF, "__DATA,")
+
+#undef CG_DATA_SECT_ENTRY
+#endif
+
+/* section name strings common to all targets other
+   than WIN32 */
+#define CG_DATA_OUTLINE_COMMON __llvm_outline
+/* Since cg data sections are not allocated, we don't need to
+ * access them at runtime.
+ */
+#define CG_DATA_OUTLINE_COFF ".loutline"
+
+#ifdef _WIN32
+/* Runtime section names and name strings. */
+#define CG_DATA_SECT_NAME CG_DATA_OUTLINE_COFF
+
+#else
+/* Runtime section names and name strings. */
+#define CG_DATA_SECT_NAME CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON)
+
+#endif
+
+/* Indexed codegen data format version (start from 1). */
+#define CG_DATA_INDEX_VERSION 1
+
+/* Helper macros. */
+#define CG_DATA_SIMPLE_QUOTE(x) #x
+#define CG_DATA_QUOTE(x) CG_DATA_SIMPLE_QUOTE(x)
diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataReader.h b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
new file mode 100644
index 00000000000000..df4ae3ed24e79a
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
@@ -0,0 +1,154 @@
+//===- CodeGenDataReader.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for reading codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATAREADER_H
+#define LLVM_CODEGENDATA_CODEGENDATAREADER_H
+
+#include "llvm/CodeGenData/CodeGenData.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/VirtualFileSystem.h"
+
+namespace llvm {
+
+class CodeGenDataReader {
+  cgdata_error LastError = cgdata_error::success;
+  std::string LastErrorMsg;
+
+public:
+  CodeGenDataReader() = default;
+  virtual ~CodeGenDataReader() = default;
+
+  /// Read the header. Required before reading first record.
+  virtual Error read() = 0;
+  /// Return the codegen data version.
+  virtual uint32_t getVersion() const = 0;
+  /// Return the codegen data kind.
+  virtual CGDataKind getDataKind() const = 0;
+  /// Return true if the data has an outlined hash tree.
+  virtual bool hasOutlinedHashTree() const = 0;
+  /// Return the outlined hash tree that is released from the reader.
+  std::unique_ptr<OutlinedHashTree> releaseOutlinedHashTree() {
+    return std::move(HashTreeRecord.HashTree);
+  }
+
+  /// Factory method to create an appropriately typed reader for the given
+  /// codegen data file path and file system.
+  static Expected<std::unique_ptr<CodeGenDataReader>>
+  create(const Twine &Path, vfs::FileSystem &FS);
+
+  /// Factory method to create an appropriately typed reader for the given
+  /// memory buffer.
+  static Expected<std::unique_ptr<CodeGenDataReader>>
+  create(std::unique_ptr<MemoryBuffer> Buffer);
+
+  /// Extract the cgdata embedded in sections from the given object file and
+  /// merge them into the GlobalOutlineRecord. This is a static helper that
+  /// is used by `llvm-cgdata merge` or ThinLTO's two-codegen rounds.
+  static Error mergeFromObjectFile(const object::ObjectFile *Obj,
+                                   OutlinedHashTreeRecord &GlobalOutlineRecord);
+
+protected:
+  /// The outlined hash tree that has been read. When it's released by
+  /// releaseOutlinedHashTree(), it's no longer valid.
+  OutlinedHashTreeRecord HashTreeRecord;
+
+  /// Set the current error and return same.
+  Error error(cgdata_error Err, const std::string &ErrMsg = "") {
+    LastError = Err;
+    LastErrorMsg = ErrMsg;
+    if (Err == cgdata_error::success)
+      return Error::success();
+    return make_error<CGDataError>(Err, ErrMsg);
+  }
+
+  Error error(Error &&E) {
+    handleAllErrors(std::move(E), [&](const CGDataError &IPE) {
+      LastError = IPE.get();
+      LastErrorMsg = IPE.getMessage();
+    });
+    return make_error<CGDataError>(LastError, LastErrorMsg);
+  }
+
+  /// Clear the current error and return a successful one.
+  Error success() { return error(cgdata_error::success); }
+};
+
+class IndexedCodeGenDataReader : public CodeGenDataReader {
+  /// The codegen data file contents.
+  std::unique_ptr<MemoryBuffer> DataBuffer;
+  /// The header
+  IndexedCGData::Header Header;
+
+public:
+  IndexedCodeGenDataReader(std::unique_ptr<MemoryBuffer> DataBuffer)
+      : DataBuffer(std::move(DataBuffer)) {}
+  IndexedCodeGenDataReader(const IndexedCodeGenDataReader &) = delete;
+  IndexedCodeGenDataReader &
+  operator=(const IndexedCodeGenDataReader &) = delete;
+
+  /// Return true if the given buffer is in binary codegen data format.
+  static bool hasFormat(const MemoryBuffer &Buffer);
+  /// Read the contents including the header.
+  Error read() override;
+  /// Return the codegen data version.
+  uint32_t getVersion() const override { return Header.Version; }
+  /// Return the codegen data kind.
+  CGDataKind getDataKind() const override {
+    return static_cast<CGDataKind>(Header.DataKind);
+  }
+  /// Return true if the header indicates the data has an outlined hash tree.
+  /// This does not mean that the data is still available.
+  bool hasOutlinedHashTree() const override {
+    return Header.DataKind &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+};
+
+/// This format is a simple text format that's suitable for test data.
+/// The header is a custom format starting with `:` per line to indicate which
+/// codegen data is recorded. `#` is used to indicate a comment.
+/// The subsequent data is a YAML format per each codegen data in order.
+/// Currently, it only has a function outlined hash tree.
+class TextCodeGenDataReader : public CodeGenDataReader {
+  /// The codegen data file contents.
+  std::unique_ptr<MemoryBuffer> DataBuffer;
+  /// Iterator over the lines of the codegen data (`#` starts a comment).
+  line_iterator Line;
+  /// Describe the kind of the codegen data.
+  CGDataKind DataKind = CGDataKind::Unknown;
+
+public:
+  TextCodeGenDataReader(std::unique_ptr<MemoryBuffer> DataBuffer_)
+      : DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, true, '#') {}
+  TextCodeGenDataReader(const TextCodeGenDataReader &) = delete;
+  TextCodeGenDataReader &operator=(const TextCodeGenDataReader &) = delete;
+
+  /// Return true if the given buffer is in text codegen data format.
+  static bool hasFormat(const MemoryBuffer &Buffer);
+  /// Read the contents including the header.
+  Error read() override;
+  /// Text format does not have version, so return 0.
+  uint32_t getVersion() const override { return 0; }
+  /// Return the codegen data kind.
+  CGDataKind getDataKind() const override { return DataKind; }
+  /// Return true if the header indicates the data has an outlined hash tree.
+  /// This does not mean that the data is still available.
+  bool hasOutlinedHashTree() const override {
+    return static_cast<uint32_t>(DataKind) &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_CODEGENDATAREADER_H
diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h b/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
new file mode 100644
index 00000000000000..e17ffc3482ec91
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
@@ -0,0 +1,68 @@
+//===- CodeGenDataWriter.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing codegen data.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGENDATA_CODEGENDATAWRITER_H +#define LLVM_CODEGENDATA_CODEGENDATAWRITER_H + +#include "llvm/CodeGenData/CodeGenData.h" +#include "llvm/CodeGenData/OutlinedHashTreeRecord.h" +#include "llvm/Support/Error.h" + +namespace llvm { + +class CGDataOStream; + +class CodeGenDataWriter { + /// The outlined hash tree to be written. + OutlinedHashTreeRecord HashTreeRecord; + + /// A bit mask describing the kind of the codegen data. + CGDataKind DataKind = CGDataKind::Unknown; + +public: + CodeGenDataWriter() = default; + ~CodeGenDataWriter() = default; + + /// Add the outlined hash tree record. The input Record is released. + void addRecord(OutlinedHashTreeRecord &Record); + + /// Write the codegen data to \c OS + Error write(raw_fd_ostream &OS); + + /// Write the codegen data in text format to \c OS + Error writeText(raw_fd_ostream &OS); + + /// Return the attributes of the current CGData. + CGDataKind getCGDataKind() const { return DataKind; } + + /// Return true if the header indicates the data has an outlined hash tree. + bool hasOutlinedHashTree() const { + return static_cast(DataKind) & + static_cast(CGDataKind::FunctionOutlinedHashTree); + } + +private: + /// The offset of the outlined hash tree in the file. 
+ uint64_t OutlinedHashTreeOffset; + + /// Write the codegen data header to \c COS + Error writeHeader(CGDataOStream &COS); + + /// Write the codegen data header in text to \c OS + Error writeHeaderText(raw_fd_ostream &OS); + + Error writeImpl(CGDataOStream &COS); +}; + +} // end namespace llvm + +#endif // LLVM_CODEGENDATA_CODEGENDATAWRITER_H diff --git a/llvm/lib/CodeGenData/CMakeLists.txt b/llvm/lib/CodeGenData/CMakeLists.txt index 3ba90f96cc86d4..1156d53afb2e0f 100644 --- a/llvm/lib/CodeGenData/CMakeLists.txt +++ b/llvm/lib/CodeGenData/CMakeLists.txt @@ -1,4 +1,7 @@ add_llvm_component_library(LLVMCodeGenData + CodeGenData.cpp + CodeGenDataReader.cpp + CodeGenDataWriter.cpp OutlinedHashTree.cpp OutlinedHashTreeRecord.cpp diff --git a/llvm/lib/CodeGenData/CodeGenData.cpp b/llvm/lib/CodeGenData/CodeGenData.cpp new file mode 100644 index 00000000000000..3bd21c97c7de7a --- /dev/null +++ b/llvm/lib/CodeGenData/CodeGenData.cpp @@ -0,0 +1,197 @@ +//===-- CodeGenData.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains support for codegen data that has stable summary which +// can be used to optimize the code in the subsequent codegen. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CodeGenData/CodeGenDataReader.h" +#include "llvm/CodeGenData/OutlinedHashTreeRecord.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/WithColor.h" + +#define DEBUG_TYPE "cg-data" + +using namespace llvm; +using namespace cgdata; + +static std::string getCGDataErrString(cgdata_error Err, + const std::string &ErrMsg = "") { + std::string Msg; + raw_string_ostream OS(Msg); + + switch (Err) { + case cgdata_error::success: + OS << "success"; + break; + case cgdata_error::eof: + OS << "end of File"; + break; + case cgdata_error::bad_magic: + OS << "invalid codegen data (bad magic)"; + break; + case cgdata_error::bad_header: + OS << "invalid codegen data (file header is corrupt)"; + break; + case cgdata_error::empty_cgdata: + OS << "empty codegen data"; + break; + case cgdata_error::malformed: + OS << "malformed codegen data"; + break; + case cgdata_error::unsupported_version: + OS << "unsupported codegen data version"; + break; + } + + // If optional error message is not empty, append it to the message. + if (!ErrMsg.empty()) + OS << ": " << ErrMsg; + + return OS.str(); +} + +namespace { + +// FIXME: This class is only here to support the transition to llvm::Error. It +// will be removed once this transition is complete. Clients should prefer to +// deal with the Error value directly, rather than converting to error_code. 
+class CGDataErrorCategoryType : public std::error_category {
+  const char *name() const noexcept override { return "llvm.cgdata"; }
+
+  std::string message(int IE) const override {
+    return getCGDataErrString(static_cast<cgdata_error>(IE));
+  }
+};
+
+} // end anonymous namespace
+
+const std::error_category &llvm::cgdata_category() {
+  static CGDataErrorCategoryType ErrorCategory;
+  return ErrorCategory;
+}
+
+std::string CGDataError::message() const {
+  return getCGDataErrString(Err, Msg);
+}
+
+char CGDataError::ID = 0;
+
+namespace {
+
+const char *CodeGenDataSectNameCommon[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix)         \
+  SectNameCommon,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+const char *CodeGenDataSectNameCoff[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix)         \
+  SectNameCoff,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+const char *CodeGenDataSectNamePrefix[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) Prefix,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+} // namespace
+
+namespace llvm {
+
+std::string getCodeGenDataSectionName(CGDataSectKind CGSK,
+                                      Triple::ObjectFormatType OF,
+                                      bool AddSegmentInfo) {
+  std::string SectName;
+
+  if (OF == Triple::MachO && AddSegmentInfo)
+    SectName = CodeGenDataSectNamePrefix[CGSK];
+
+  if (OF == Triple::COFF)
+    SectName += CodeGenDataSectNameCoff[CGSK];
+  else
+    SectName += CodeGenDataSectNameCommon[CGSK];
+
+  return SectName;
+}
+
+std::unique_ptr<CodeGenData> CodeGenData::Instance = nullptr;
+std::once_flag CodeGenData::OnceFlag;
+
+CodeGenData &CodeGenData::getInstance() {
+  std::call_once(CodeGenData::OnceFlag, []() {
+    auto *CGD = new CodeGenData();
+    Instance.reset(CGD);
+
+    // TODO: Initialize writer or reader mode for the client optimization.
+  });
+  return *(Instance.get());
+}
+
+namespace IndexedCGData {
+
+Expected<Header> Header::readFromBuffer(const unsigned char *Curr) {
+  using namespace support;
+
+  static_assert(std::is_standard_layout_v<llvm::IndexedCGData::Header>,
+                "The header should be standard layout type since we use offset "
+                "of fields to read.");
+  Header H = {};
+  H.Magic = endian::readNext<uint64_t, endianness::little, unaligned>(Curr);
+  if (H.Magic != IndexedCGData::Magic)
+    return make_error<CGDataError>(cgdata_error::bad_magic);
+  H.Version = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
+  if (H.Version < Version1 || H.Version > CGDataVersion::CurrentVersion)
+    return make_error<CGDataError>(cgdata_error::unsupported_version);
+  H.DataKind = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
+
+  switch (H.Version) {
+    // When a new field is added to the header add a case statement here to
+    // compute the size as offset of the new field + size of the new field. This
+    // relies on the field being added to the end of the list.
+    static_assert(IndexedCGData::CGDataVersion::CurrentVersion == Version1,
+                  "Please update the size computation below if a new field has "
+                  "been added to the header, if not add a case statement to "
+                  "fall through to the latest version.");
+  case 1ull:
+    H.OutlinedHashTreeOffset =
+        endian::readNext<uint64_t, endianness::little, unaligned>(Curr);
+  }
+
+  return H;
+}
+
+} // end namespace IndexedCGData
+
+namespace cgdata {
+
+void warn(Twine Message, std::string Whence, std::string Hint) {
+  WithColor::warning();
+  if (!Whence.empty())
+    errs() << Whence << ": ";
+  errs() << Message << "\n";
+  if (!Hint.empty())
+    WithColor::note() << Hint << "\n";
+}
+
+void warn(Error E, StringRef Whence) {
+  // Handle every payload, not only CGDataError: an Error that is dropped
+  // without being handled (or checked, for success) aborts in checked builds.
+  handleAllErrors(std::move(E), [&](const ErrorInfoBase &EIB) {
+    warn(EIB.message(), std::string(Whence), std::string(""));
+  });
+}
+
+} // end namespace cgdata
+
+} // end namespace llvm
diff --git a/llvm/lib/CodeGenData/CodeGenDataReader.cpp b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
new file mode 100644
index 00000000000000..1b08085dec2f25
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
@@ -0,0 +1,174 @@
+//===- CodeGenDataReader.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for reading codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/CodeGenDataReader.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+#define DEBUG_TYPE "cg-data-reader"
+
+using namespace llvm;
+
+namespace llvm {
+
+static Expected<std::unique_ptr<MemoryBuffer>>
+setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) {
+  auto BufferOrErr = Filename.str() == "-" ? MemoryBuffer::getSTDIN()
+                                           : FS.getBufferForFile(Filename);
+  if (std::error_code EC = BufferOrErr.getError())
+    return errorCodeToError(EC);
+  return std::move(BufferOrErr.get());
+}
+
+Error CodeGenDataReader::mergeFromObjectFile(
+    const object::ObjectFile *Obj,
+    OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  Triple TT = Obj->makeTriple();
+  auto CGOutLineName =
+      getCodeGenDataSectionName(CG_outline, TT.getObjectFormat(), false);
+
+  for (auto &Section : Obj->sections()) {
+    Expected<StringRef> NameOrErr = Section.getName();
+    if (!NameOrErr)
+      return NameOrErr.takeError();
+    Expected<StringRef> ContentsOrErr = Section.getContents();
+    if (!ContentsOrErr)
+      return ContentsOrErr.takeError();
+    auto *Data = reinterpret_cast<const unsigned char *>(ContentsOrErr->data());
+    auto *EndData = Data + ContentsOrErr->size();
+
+    if (*NameOrErr == CGOutLineName) {
+      // In case dealing with an executable that has concatenated cgdata,
+      // we want to merge them into a single cgdata.
+      // Although it's not a typical workflow, we support this scenario.
+      while (Data < EndData) {
+        OutlinedHashTreeRecord LocalOutlineRecord;
+        LocalOutlineRecord.deserialize(Data);
+        GlobalOutlineRecord.merge(LocalOutlineRecord);
+      }
+    }
+    // TODO: Add support for other cgdata sections.
+  }
+
+  return Error::success();
+}
+
+Error IndexedCodeGenDataReader::read() {
+  using namespace support;
+
+  // The smallest header with the version 1 is 24 bytes
+  const unsigned MinHeaderSize = 24;
+  if (DataBuffer->getBufferSize() < MinHeaderSize)
+    return error(cgdata_error::bad_header);
+
+  auto *Start =
+      reinterpret_cast<const unsigned char *>(DataBuffer->getBufferStart());
+  auto HeaderOr = IndexedCGData::Header::readFromBuffer(Start);
+  if (!HeaderOr)
+    return HeaderOr.takeError();
+  Header = HeaderOr.get();
+
+  if (hasOutlinedHashTree()) {
+    // The offset comes from the file; bounds-check it as an integer before
+    // doing pointer arithmetic so a huge value cannot wrap past the buffer.
+    if (Header.OutlinedHashTreeOffset >= DataBuffer->getBufferSize())
+      return error(cgdata_error::eof);
+    const unsigned char *Ptr = Start + Header.OutlinedHashTreeOffset;
+    HashTreeRecord.deserialize(Ptr);
+  }
+
+  return success();
+}
+
+Expected<std::unique_ptr<CodeGenDataReader>>
+CodeGenDataReader::create(const Twine &Path, vfs::FileSystem &FS) {
+  // Set up the buffer to read.
+  auto BufferOrError = setupMemoryBuffer(Path, FS);
+  if (Error E = BufferOrError.takeError())
+    return std::move(E);
+  return CodeGenDataReader::create(std::move(BufferOrError.get()));
+}
+
+Expected<std::unique_ptr<CodeGenDataReader>>
+CodeGenDataReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
+  if (Buffer->getBufferSize() == 0)
+    return make_error<CGDataError>(cgdata_error::empty_cgdata);
+
+  std::unique_ptr<CodeGenDataReader> Reader;
+  // Create the reader.
+  if (IndexedCodeGenDataReader::hasFormat(*Buffer))
+    Reader = std::make_unique<IndexedCodeGenDataReader>(std::move(Buffer));
+  else if (TextCodeGenDataReader::hasFormat(*Buffer))
+    Reader = std::make_unique<TextCodeGenDataReader>(std::move(Buffer));
+  else
+    return make_error<CGDataError>(cgdata_error::malformed);
+
+  // Initialize the reader and return the result.
+  if (Error E = Reader->read())
+    return std::move(E);
+
+  return std::move(Reader);
+}
+
+bool IndexedCodeGenDataReader::hasFormat(const MemoryBuffer &DataBuffer) {
+  using namespace support;
+  if (DataBuffer.getBufferSize() < 8)
+    return false;
+
+  uint64_t Magic = endian::read<uint64_t, llvm::endianness::little, aligned>(
+      DataBuffer.getBufferStart());
+  // Verify that it's magical.
+  return Magic == IndexedCGData::Magic;
+}
+
+bool TextCodeGenDataReader::hasFormat(const MemoryBuffer &Buffer) {
+  // Verify that this really looks like plain ASCII text by checking a
+  // 'reasonable' number of characters (up to the magic size). Slice the
+  // buffer by its real size; building a StringRef from the raw start pointer
+  // would stop at the first NUL and let the checked range overrun it.
+  StringRef Prefix = Buffer.getBuffer().take_front(sizeof(uint64_t));
+  return std::all_of(Prefix.begin(), Prefix.end(),
+                     [](char c) { return isPrint(c) || isSpace(c); });
+}
+Error TextCodeGenDataReader::read() {
+  using namespace support;
+
+  // Parse the custom header line by line.
+  while (Line->starts_with(":")) {
+    StringRef Str = Line->substr(1);
+    if (Str.equals_insensitive("outlined_hash_tree"))
+      DataKind |= CGDataKind::FunctionOutlinedHashTree;
+    else
+      return error(cgdata_error::bad_header);
+    ++Line;
+  }
+
+  // We treat an empty header (that has a comment # only) as a valid header.
+  if (Line.is_at_eof()) {
+    if (DataKind != CGDataKind::Unknown)
+      return error(cgdata_error::bad_header);
+    return Error::success();
+  }
+
+  // The YAML docs follow after the header.
+  const char *Pos = (*Line).data();
+  size_t Size = reinterpret_cast<size_t>(DataBuffer->getBufferEnd()) -
+                reinterpret_cast<size_t>(Pos);
+  yaml::Input YOS(StringRef(Pos, Size));
+  if (hasOutlinedHashTree())
+    HashTreeRecord.deserializeYAML(YOS);
+
+  // TODO: Add more yaml cgdata in order
+
+  return Error::success();
+}
+} // end namespace llvm
diff --git a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
new file mode 100644
index 00000000000000..9aa0d86223f714
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
@@ -0,0 +1,162 @@
+//===- CodeGenDataWriter.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/CodeGenDataWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+
+#define DEBUG_TYPE "cg-data-writer"
+
+using namespace llvm;
+
+namespace llvm {
+
+/// A struct to define how the data stream should be patched.
+struct CGDataPatchItem {
+  uint64_t Pos; // Where to patch.
+  uint64_t *D;  // Pointer to an array of source data.
+  int N;        // Number of elements in \c D array.
+};
+
+// A wrapper class to abstract writer stream with support of bytes
+// back patching.
+class CGDataOStream {
+public:
+  CGDataOStream(raw_fd_ostream &FD)
+      : IsFDOStream(true), OS(FD), LE(FD, llvm::endianness::little) {}
+  CGDataOStream(raw_string_ostream &STR)
+      : IsFDOStream(false), OS(STR), LE(STR, llvm::endianness::little) {}
+
+  uint64_t tell() { return OS.tell(); }
+  void write(uint64_t V) { LE.write<uint64_t>(V); }
+  void write32(uint32_t V) { LE.write<uint32_t>(V); }
+  void write8(uint8_t V) { LE.write<uint8_t>(V); }
+
+  // \c patch can only be called when all data is written and flushed.
+  // For raw_string_ostream, the patch is done on the target string
+  // directly and it won't be reflected in the stream's internal buffer.
+  void patch(ArrayRef<CGDataPatchItem> P) {
+    using namespace support;
+
+    if (IsFDOStream) {
+      raw_fd_ostream &FDOStream = static_cast<raw_fd_ostream &>(OS);
+      const uint64_t LastPos = FDOStream.tell();
+      for (const auto &K : P) {
+        FDOStream.seek(K.Pos);
+        for (int I = 0; I < K.N; I++)
+          write(K.D[I]);
+      }
+      // Reset the stream to the last position after patching so that users
+      // don't accidentally overwrite data. This makes it consistent with
+      // the string stream below which replaces the data directly.
+      FDOStream.seek(LastPos);
+    } else {
+      raw_string_ostream &SOStream = static_cast<raw_string_ostream &>(OS);
+      std::string &Data = SOStream.str(); // with flush
+      for (const auto &K : P) {
+        for (int I = 0; I < K.N; I++) {
+          uint64_t Bytes =
+              endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
+          Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t),
+                       (const char *)&Bytes, sizeof(uint64_t));
+        }
+      }
+    }
+  }
+
+  // If \c OS is an instance of \c raw_fd_ostream, this field will be
+  // true. Otherwise, \c OS will be a raw_string_ostream.
+  bool IsFDOStream;
+  raw_ostream &OS;
+  support::endian::Writer LE;
+};
+
+} // end namespace llvm
+
+void CodeGenDataWriter::addRecord(OutlinedHashTreeRecord &Record) {
+  assert(Record.HashTree && "empty hash tree in the record");
+  HashTreeRecord.HashTree = std::move(Record.HashTree);
+
+  DataKind |= CGDataKind::FunctionOutlinedHashTree;
+}
+
+Error CodeGenDataWriter::write(raw_fd_ostream &OS) {
+  CGDataOStream COS(OS);
+  return writeImpl(COS);
+}
+
+Error CodeGenDataWriter::writeHeader(CGDataOStream &COS) {
+  using namespace support;
+  IndexedCGData::Header Header;
+  Header.Magic = IndexedCGData::Magic;
+  Header.Version = IndexedCGData::Version;
+
+  // Set the CGDataKind depending on the kind.
+  Header.DataKind = 0;
+  if (hasOutlinedHashTree())
+    Header.DataKind |=
+        static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+
+  Header.OutlinedHashTreeOffset = 0;
+
+  // Only write out up to the CGDataKind. We need to remember the offset of the
+  // remaining fields to allow back patching later.
+  COS.write(Header.Magic);
+  COS.write32(Header.Version);
+  COS.write32(Header.DataKind);
+
+  // Save the location of Header.OutlinedHashTreeOffset field in \c COS.
+  OutlinedHashTreeOffset = COS.tell();
+
+  // Reserve the space for OutlinedHashTreeOffset field.
+  COS.write(0);
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeImpl(CGDataOStream &COS) {
+  if (Error E = writeHeader(COS))
+    return E;
+
+  uint64_t OutlinedHashTreeFieldStart = COS.tell();
+  if (hasOutlinedHashTree())
+    HashTreeRecord.serialize(COS.OS);
+
+  // Back patch the offsets.
+  CGDataPatchItem PatchItems[] = {
+      {OutlinedHashTreeOffset, &OutlinedHashTreeFieldStart, 1}};
+  COS.patch(PatchItems);
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeHeaderText(raw_fd_ostream &OS) {
+  if (hasOutlinedHashTree())
+    OS << "# Outlined stable hash tree\n:outlined_hash_tree\n";
+
+  // TODO: Add more data types in this header
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeText(raw_fd_ostream &OS) {
+  if (Error E = writeHeaderText(OS))
+    return E;
+
+  yaml::Output YOS(OS);
+  if (hasOutlinedHashTree())
+    HashTreeRecord.serializeYAML(YOS);
+
+  // TODO: Write more yaml cgdata in order
+
+  return Error::success();
+}
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 6127b76db06b7f..be777ce650e874 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -73,6 +73,7 @@ set(LLVM_TEST_DEPENDS
           llvm-c-test
           llvm-cat
           llvm-cfi-verify
+          llvm-cgdata
           llvm-config
           llvm-cov
           llvm-cvtres
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 4c05317036d1a3..cc7e9d535a9c33 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -180,6 +180,7 @@ def get_asan_rtlib():
         "llvm-addr2line",
         "llvm-bcanalyzer",
         "llvm-bitcode-strip",
+        "llvm-cgdata",
         "llvm-config",
         "llvm-cov",
         "llvm-cxxdump",
diff --git a/llvm/test/tools/llvm-cgdata/dump.test b/llvm/test/tools/llvm-cgdata/dump.test
new file mode 100644
index 00000000000000..ce2ad27a5ff81c
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/dump.test
@@ -0,0 +1,30 @@
+# Test dump between the binary and text formats.
+ +RUN: split-file %s %t + +RUN: llvm-cgdata dump -binary %t/dump.cgtext -o %t/dump.cgdata +RUN: llvm-cgdata dump -text %t/dump.cgdata -o %t/dump-round.cgtext +RUN: llvm-cgdata dump -binary %t/dump-round.cgtext -o %t/dump-round.cgdata +RUN: diff %t/dump.cgdata %t/dump-round.cgdata + +;--- dump.cgtext +# Outlined stable hash tree +:outlined_hash_tree +--- +0: + Hash: 0x0 + Terminals: 0 + SuccessorIds: [ 1 ] +1: + Hash: 0x1 + Terminals: 0 + SuccessorIds: [ 2, 3 ] +2: + Hash: 0x3 + Terminals: 5 + SuccessorIds: [ ] +3: + Hash: 0x2 + Terminals: 4 + SuccessorIds: [ ] +... diff --git a/llvm/test/tools/llvm-cgdata/empty.test b/llvm/test/tools/llvm-cgdata/empty.test new file mode 100644 index 00000000000000..d5e201b9eec17f --- /dev/null +++ b/llvm/test/tools/llvm-cgdata/empty.test @@ -0,0 +1,32 @@ +# Test for empty cgdata file, which is invalid. +RUN: touch %t_emptyfile.cgtext +RUN: not llvm-cgdata dump %t_emptyfile.cgtext -text -o - 2>&1 | FileCheck %s --check-prefix ERROR +ERROR: {{.}}emptyfile.cgtext: empty codegen data + +# Test for empty header in the text format. It can be converted to a valid binary file. +RUN: printf '#' > %t_emptyheader.cgtext +RUN: llvm-cgdata dump %t_emptyheader.cgtext -binary -o %t_emptyheader.cgdata + +# Without any cgdata other than the header, no data shows by default. +RUN: llvm-cgdata show %t_emptyheader.cgdata | FileCheck %s --allow-empty --check-prefix EMPTY +EMPTY-NOT: any + +# The version number appears when asked, as it's in the header +RUN: llvm-cgdata show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix VERSION +VERSION: Version: {{.}} + +# When converting a binary file (w/ the header only) to a text file, it's an empty file as the text format does not have an explicit header. +RUN: llvm-cgdata dump %t_emptyheader.cgdata -text -o - | FileCheck %s --allow-empty --check-prefix EMPTY + +# Synthesize a header only cgdata. 
+# struct Header { +# uint64_t Magic; +# uint32_t Version; +# uint32_t DataKind; +# uint64_t OutlinedHashTreeOffset; +# } +RUN: printf '\xffcgdata\x81' > %t_header.cgdata +RUN: printf '\x01\x00\x00\x00' >> %t_header.cgdata +RUN: printf '\x00\x00\x00\x00' >> %t_header.cgdata +RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata +RUN: diff %t_header.cgdata %t_emptyheader.cgdata diff --git a/llvm/test/tools/llvm-cgdata/error.test b/llvm/test/tools/llvm-cgdata/error.test new file mode 100644 index 00000000000000..5e1b14de5e509d --- /dev/null +++ b/llvm/test/tools/llvm-cgdata/error.test @@ -0,0 +1,38 @@ +# Test various error cases + +# Synthesize a header only cgdata. +# struct Header { +# uint64_t Magic; +# uint32_t Version; +# uint32_t DataKind; +# uint64_t OutlinedHashTreeOffset; +# } +RUN: touch %t_empty.cgdata +RUN: not llvm-cgdata show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix EMPTY +EMPTY: {{.}}cgdata: empty codegen data + +# Not a magic. +RUN: printf '\xff' > %t_malformed.cgdata +RUN: not llvm-cgdata show %t_malformed.cgdata 2>&1 | FileCheck %s --check-prefix MALFORMED +MALFORMED: {{.}}cgdata: malformed codegen data + +# The minimum header size is 24. +RUN: printf '\xffcgdata\x81' > %t_corrupt.cgdata +RUN: not llvm-cgdata show %t_corrupt.cgdata 2>&1 | FileCheck %s --check-prefix CORRUPT +CORRUPT: {{.}}cgdata: invalid codegen data (file header is corrupt) + +# The current version 1 while the header says 2. +RUN: printf '\xffcgdata\x81' > %t_version.cgdata +RUN: printf '\x02\x00\x00\x00' >> %t_version.cgdata +RUN: printf '\x00\x00\x00\x00' >> %t_version.cgdata +RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata +RUN: not llvm-cgdata show %t_version.cgdata 2>&1 | FileCheck %s --check-prefix BAD_VERSION +BAD_VERSION: {{.}}cgdata: unsupported codegen data version + +# Header says an outlined hash tree, but the file ends after the header. 
+RUN: printf '\xffcgdata\x81' > %t_eof.cgdata +RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata +RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata +RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_eof.cgdata +RUN: not llvm-cgdata show %t_eof.cgdata 2>&1 | FileCheck %s --check-prefix EOF +EOF: {{.}}cgdata: end of File diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-archive.test new file mode 100644 index 00000000000000..a27d6c2a16f4ab --- /dev/null +++ b/llvm/test/tools/llvm-cgdata/merge-archive.test @@ -0,0 +1,75 @@ +# Merge an archive that has two object files having cgdata (__llvm_outline) + +RUN: split-file %s %t + +RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o +RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o +RUN: llvm-ar rcs %t/merge-archive.a %t/merge-1.o %t/merge-2.o +RUN: llvm-cgdata merge %t/merge-archive.a -o %t/merge-archive.cgdata +RUN: llvm-cgdata show %t/merge-archive.cgdata | FileCheck %s +CHECK: Outlined hash tree: +CHECK-NEXT: Total Node Count: 4 +CHECK-NEXT: Terminal Node Count: 2 +CHECK-NEXT: Depth: 2 + +RUN: llvm-cgdata dump %t/merge-archive.cgdata | FileCheck %s --check-prefix TREE +TREE: # Outlined stable hash tree +TREE-NEXT: :outlined_hash_tree +TREE-NEXT: --- +TREE-NEXT: 0: +TREE-NEXT: Hash: 0x0 +TREE-NEXT: Terminals: 0 +TREE-NEXT: SuccessorIds: [ 1 ] +TREE-NEXT: 1: +TREE-NEXT: Hash: 0x1 +TREE-NEXT: Terminals: 0 +TREE-NEXT: SuccessorIds: [ 2, 3 ] +TREE-NEXT: 2: +TREE-NEXT: Hash: 0x3 +TREE-NEXT: Terminals: 5 +TREE-NEXT: SuccessorIds: [ ] +TREE-NEXT: 3: +TREE-NEXT: Hash: 0x2 +TREE-NEXT: Terminals: 4 +TREE-NEXT: SuccessorIds: [ ] +TREE-NEXT: ... + +;--- merge-1.ll + +; The .data is encoded in a binary form based on the following yaml form. 
See serialize() in OutlinedHashTreeRecord.cpp +;--- +;0: +; Hash: 0x0 +; Terminals: 0 +; SuccessorIds: [ 1 ] +;1: +; Hash: 0x1 +; Terminals: 0 +; SuccessorIds: [ 2 ] +;2: +; Hash: 0x2 +; Terminals: 4 +; SuccessorIds: [ ] +;... + +@.data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline" + +;--- merge-2.ll + +; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp +;--- +;0: +; Hash: 0x0 +; Terminals: 0 +; SuccessorIds: [ 1 ] +;1: +; Hash: 0x1 +; Terminals: 0 +; SuccessorIds: [ 2 ] +;2: +; Hash: 0x3 +; Terminals: 5 +; SuccessorIds: [ ] +;... + +@.data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline" diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test new file mode 100644 index 00000000000000..3411133cb7aacb --- /dev/null +++ b/llvm/test/tools/llvm-cgdata/merge-concat.test @@ -0,0 +1,68 @@ +# Merge a binary file (e.g., a linked executable) having concatnated cgdata (__llvm_outline) + +RUN: split-file %s %t + +RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o +RUN: llvm-cgdata merge %t/merge-concat.o -o %t/merge-concat.cgdata +RUN: llvm-cgdata show %t/merge-concat.cgdata | FileCheck %s +CHECK: Outlined hash tree: +CHECK-NEXT: Total Node Count: 4 +CHECK-NEXT: Terminal Node Count: 2 +CHECK-NEXT: Depth: 2 + +RUN: llvm-cgdata dump %t/merge-concat.cgdata | FileCheck %s --check-prefix TREE +TREE: # Outlined stable hash tree +TREE-NEXT: 
:outlined_hash_tree +TREE-NEXT: --- +TREE-NEXT: 0: +TREE-NEXT: Hash: 0x0 +TREE-NEXT: Terminals: 0 +TREE-NEXT: SuccessorIds: [ 1 ] +TREE-NEXT: 1: +TREE-NEXT: Hash: 0x1 +TREE-NEXT: Terminals: 0 +TREE-NEXT: SuccessorIds: [ 2, 3 ] +TREE-NEXT: 2: +TREE-NEXT: Hash: 0x3 +TREE-NEXT: Terminals: 5 +TREE-NEXT: SuccessorIds: [ ] +TREE-NEXT: 3: +TREE-NEXT: Hash: 0x2 +TREE-NEXT: Terminals: 4 +TREE-NEXT: SuccessorIds: [ ] +TREE-NEXT: ... + +;--- merge-concat.ll + +; In a linked executable (as opposed to an object file), cgdata in __llvm_outline might be concatenated. Although this is not a typical workflow, we simply support this case to parse cgdata that is concatenated. In other words, the following two trees are encoded back-to-back in a binary format. +;--- +;0: +; Hash: 0x0 +; Terminals: 0 +; SuccessorIds: [ 1 ] +;1: +; Hash: 0x1 +; Terminals: 0 +; SuccessorIds: [ 2 ] +;2: +; Hash: 0x2 +; Terminals: 4 +; SuccessorIds: [ ] +;... +;--- +;0: +; Hash: 0x0 +; Terminals: 0 +; SuccessorIds: [ 1 ] +;1: +; Hash: 0x1 +; Terminals: 0 +; SuccessorIds: [ 2 ] +;2: +; Hash: 0x3 +; Terminals: 5 +; SuccessorIds: [ ] +;... 
+ +@.data1 = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline" +@.data2 = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline" diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-double.test new file mode 100644 index 00000000000000..6ce358cd72325b --- /dev/null +++ b/llvm/test/tools/llvm-cgdata/merge-double.test @@ -0,0 +1,74 @@ +# Merge two object files having cgdata (__llvm_outline) + +RUN: split-file %s %t + +RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o +RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o +RUN: llvm-cgdata merge %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata +RUN: llvm-cgdata show %t/merge.cgdata | FileCheck %s +CHECK: Outlined hash tree: +CHECK-NEXT: Total Node Count: 4 +CHECK-NEXT: Terminal Node Count: 2 +CHECK-NEXT: Depth: 2 + +RUN: llvm-cgdata dump %t/merge.cgdata | FileCheck %s --check-prefix TREE +TREE: # Outlined stable hash tree +TREE-NEXT: :outlined_hash_tree +TREE-NEXT: --- +TREE-NEXT: 0: +TREE-NEXT: Hash: 0x0 +TREE-NEXT: Terminals: 0 +TREE-NEXT: SuccessorIds: [ 1 ] +TREE-NEXT: 1: +TREE-NEXT: Hash: 0x1 +TREE-NEXT: Terminals: 0 +TREE-NEXT: SuccessorIds: [ 2, 3 ] +TREE-NEXT: 2: +TREE-NEXT: Hash: 0x3 +TREE-NEXT: Terminals: 5 +TREE-NEXT: SuccessorIds: [ ] +TREE-NEXT: 3: +TREE-NEXT: Hash: 0x2 +TREE-NEXT: Terminals: 4 +TREE-NEXT: SuccessorIds: [ ] +TREE-NEXT: ... + +;--- merge-1.ll + +; The .data is encoded in a binary form based on the following yaml form. 
See serialize() in OutlinedHashTreeRecord.cpp +;--- +;0: +; Hash: 0x0 +; Terminals: 0 +; SuccessorIds: [ 1 ] +;1: +; Hash: 0x1 +; Terminals: 0 +; SuccessorIds: [ 2 ] +;2: +; Hash: 0x2 +; Terminals: 4 +; SuccessorIds: [ ] +;... + +@.data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline" + +;--- merge-2.ll + +; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp +;--- +;0: +; Hash: 0x0 +; Terminals: 0 +; SuccessorIds: [ 1 ] +;1: +; Hash: 0x1 +; Terminals: 0 +; SuccessorIds: [ 2 ] +;2: +; Hash: 0x3 +; Terminals: 5 +; SuccessorIds: [ ] +;... + +@.data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline" diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-single.test new file mode 100644 index 00000000000000..73bdd9800dbe1d --- /dev/null +++ b/llvm/test/tools/llvm-cgdata/merge-single.test @@ -0,0 +1,43 @@ +# Test merge a single object file into a cgdata + +RUN: split-file %s %t + +# Merge an object file that has no cgdata (__llvm_outline). It still produces a header only cgdata. 
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-empty.ll -o %t/merge-empty.o +RUN: llvm-cgdata merge %t/merge-empty.o -o %t/merge-empty.cgdata +RUN: llvm-cgdata show %t/merge-empty.cgdata | FileCheck %s --allow-empty --check-prefix EMPTY +EMPTY-NOT: any + + +# Merge an object file having cgdata (__llvm_outline) +RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o +RUN: llvm-cgdata merge %t/merge-single.o -o %t/merge-single.cgdata +RUN: llvm-cgdata show %t/merge-single.cgdata | FileCheck %s +CHECK: Outlined hash tree: +CHECK-NEXT: Total Node Count: 3 +CHECK-NEXT: Terminal Node Count: 1 +CHECK-NEXT: Depth: 2 + +;--- merge-empty.ll +@.data = private unnamed_addr constant [1 x i8] c"\01" + +;--- merge-single.ll + +; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp +;--- +;0: +; Hash: 0x0 +; Terminals: 0 +; SuccessorIds: [ 1 ] +;1: +; Hash: 0x1 +; Terminals: 0 +; SuccessorIds: [ 2 ] +;2: +; Hash: 0x2 +; Terminals: 4 +; SuccessorIds: [ ] +;... 
+ +@.data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline" + diff --git a/llvm/test/tools/llvm-cgdata/show.test b/llvm/test/tools/llvm-cgdata/show.test new file mode 100644 index 00000000000000..accb4b77ede246 --- /dev/null +++ b/llvm/test/tools/llvm-cgdata/show.test @@ -0,0 +1,30 @@ +# Test show + +RUN: split-file %s %t +RUN: llvm-cgdata show %t/show.cgtext | FileCheck %s + +CHECK: Outlined hash tree: +CHECK-NEXT: Total Node Count: 3 +CHECK-NEXT: Terminal Node Count: 1 +CHECK-NEXT: Depth: 2 + +# Convert the text file to the binary file +RUN: llvm-cgdata dump -binary %t/show.cgtext -o %t/show.cgdata +RUN: llvm-cgdata show %t/show.cgdata | FileCheck %s + +;--- show.cgtext +:outlined_hash_tree +--- +0: + Hash: 0x0 + Terminals: 0 + SuccessorIds: [ 1 ] +1: + Hash: 0x1 + Terminals: 0 + SuccessorIds: [ 2 ] +2: + Hash: 0x2 + Terminals: 3 + SuccessorIds: [ ] +... diff --git a/llvm/tools/llvm-cgdata/CMakeLists.txt b/llvm/tools/llvm-cgdata/CMakeLists.txt new file mode 100644 index 00000000000000..4f1f7ff635bc3c --- /dev/null +++ b/llvm/tools/llvm-cgdata/CMakeLists.txt @@ -0,0 +1,15 @@ +set(LLVM_LINK_COMPONENTS + CodeGen + CodeGenData + Core + Object + Support + ) + +add_llvm_tool(llvm-cgdata + llvm-cgdata.cpp + + DEPENDS + intrinsics_gen + GENERATE_DRIVER + ) diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp new file mode 100644 index 00000000000000..195f066fd6b872 --- /dev/null +++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp @@ -0,0 +1,268 @@ +//===-- llvm-cgdata.cpp - LLVM CodeGen Data Tool --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// llvm-cgdata parses raw codegen data embedded in compiled binary files, and +// merges them into a single .cgdata file. It can also inspect and manipulate +// a .cgdata file. This .cgdata can contain various codegen data like outlining +// information, and it can be used to optimize the code in the subsequent build. +// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGenData/CodeGenDataReader.h" +#include "llvm/CodeGenData/CodeGenDataWriter.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Object/Archive.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/LLVMDriver.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Support/WithColor.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::object; + +// TODO: https://llvm.org/docs/CommandGuide/llvm-cgdata.html has documentations +// on each subcommand. 
+cl::SubCommand DumpSubcommand( + "dump", + "Dump the (indexed) codegen data file in either text or binary format."); +cl::SubCommand MergeSubcommand( + "merge", "Takes binary files having raw codegen data in custom sections, " + "and merge them into an index codegen data file."); +cl::SubCommand + ShowSubcommand("show", "Show summary of the (indexed) codegen data file."); + +enum CGDataFormat { + CD_None = 0, + CD_Text, + CD_Binary, +}; + +cl::opt OutputFilename("output", cl::value_desc("output"), + cl::init("-"), cl::desc("Output file"), + cl::sub(DumpSubcommand), + cl::sub(MergeSubcommand)); +cl::alias OutputFilenameA("o", cl::desc("Alias for --output"), + cl::aliasopt(OutputFilename)); + +cl::opt Filename(cl::Positional, cl::desc(""), + cl::sub(DumpSubcommand), cl::sub(ShowSubcommand)); +cl::list InputFilenames(cl::Positional, cl::sub(MergeSubcommand), + cl::desc("")); +cl::opt OutputFormat( + cl::desc("Format of output data"), cl::sub(DumpSubcommand), + cl::init(CD_Text), + cl::values(clEnumValN(CD_Text, "text", "Text encoding"), + clEnumValN(CD_Binary, "binary", "Binary encoding"))); + +cl::opt ShowCGDataVersion("cgdata-version", cl::init(false), + cl::desc("Show cgdata version. 
"), + cl::sub(ShowSubcommand)); + +static void exitWithError(Twine Message, std::string Whence = "", + std::string Hint = "") { + WithColor::error(); + if (!Whence.empty()) + errs() << Whence << ": "; + errs() << Message << "\n"; + if (!Hint.empty()) + WithColor::note() << Hint << "\n"; + ::exit(1); +} + +static void exitWithError(Error E, StringRef Whence = "") { + if (E.isA()) { + handleAllErrors(std::move(E), [&](const CGDataError &IPE) { + exitWithError(IPE.message(), std::string(Whence)); + }); + return; + } + + exitWithError(toString(std::move(E)), std::string(Whence)); +} + +static void exitWithErrorCode(std::error_code EC, StringRef Whence = "") { + exitWithError(EC.message(), std::string(Whence)); +} + +static int dump_main(int argc, const char *argv[]) { + if (Filename == OutputFilename) { + errs() << sys::path::filename(argv[0]) << " " << argv[1] + << ": Input file name cannot be the same as the output file name!\n"; + return 1; + } + + std::error_code EC; + raw_fd_ostream OS(OutputFilename.data(), EC, + OutputFormat == CD_Text ? 
sys::fs::OF_TextWithCRLF + : sys::fs::OF_None); + if (EC) + exitWithErrorCode(EC, OutputFilename); + + auto FS = vfs::getRealFileSystem(); + auto ReaderOrErr = CodeGenDataReader::create(Filename, *FS); + if (Error E = ReaderOrErr.takeError()) + exitWithError(std::move(E), Filename); + + CodeGenDataWriter Writer; + auto Reader = ReaderOrErr->get(); + if (Reader->hasOutlinedHashTree()) { + OutlinedHashTreeRecord Record(Reader->releaseOutlinedHashTree()); + Writer.addRecord(Record); + } + + if (OutputFormat == CD_Text) { + if (Error E = Writer.writeText(OS)) + exitWithError(std::move(E)); + } else { + if (Error E = Writer.write(OS)) + exitWithError(std::move(E)); + } + + return 0; +} + +static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer, + OutlinedHashTreeRecord &GlobalOutlineRecord); + +static bool handleArchive(StringRef Filename, Archive &Arch, + OutlinedHashTreeRecord &GlobalOutlineRecord) { + bool Result = true; + Error Err = Error::success(); + for (const auto &Child : Arch.children(Err)) { + auto BuffOrErr = Child.getMemoryBufferRef(); + if (Error E = BuffOrErr.takeError()) + exitWithError(std::move(E), Filename); + auto NameOrErr = Child.getName(); + if (Error E = NameOrErr.takeError()) + exitWithError(std::move(E), Filename); + std::string Name = (Filename + "(" + NameOrErr.get() + ")").str(); + Result &= handleBuffer(Name, BuffOrErr.get(), GlobalOutlineRecord); + } + if (Err) + exitWithError(std::move(Err), Filename); + return Result; +} + +static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer, + OutlinedHashTreeRecord &GlobalOutlineRecord) { + Expected> BinOrErr = object::createBinary(Buffer); + if (Error E = BinOrErr.takeError()) + exitWithError(std::move(E), Filename); + + bool Result = true; + if (auto *Obj = dyn_cast(BinOrErr->get())) { + if (Error E = + CodeGenDataReader::mergeFromObjectFile(Obj, GlobalOutlineRecord)) + exitWithError(std::move(E), Filename); + } else if (auto *Arch = dyn_cast(BinOrErr->get())) { + Result 
&= handleArchive(Filename, *Arch, GlobalOutlineRecord); + } else { + // TODO: Support for the MachO universal binary format. + errs() << "Error: unsupported binary file: " << Filename << "\n"; + Result = false; + } + + return Result; +} + +static bool handleFile(StringRef Filename, + OutlinedHashTreeRecord &GlobalOutlineRecord) { + ErrorOr> BuffOrErr = + MemoryBuffer::getFileOrSTDIN(Filename); + if (std::error_code EC = BuffOrErr.getError()) + exitWithErrorCode(EC, Filename); + return handleBuffer(Filename, *BuffOrErr.get(), GlobalOutlineRecord); +} + +static int merge_main(int argc, const char *argv[]) { + bool Result = true; + OutlinedHashTreeRecord GlobalOutlineRecord; + for (auto &Filename : InputFilenames) + Result &= handleFile(Filename, GlobalOutlineRecord); + + if (!Result) { + errs() << "Error: failed to merge codegen data files.\n"; + return 1; + } + + CodeGenDataWriter Writer; + if (!GlobalOutlineRecord.empty()) + Writer.addRecord(GlobalOutlineRecord); + + std::error_code EC; + raw_fd_ostream Output(OutputFilename, EC, sys::fs::OF_None); + if (EC) + exitWithErrorCode(EC, OutputFilename); + + if (auto E = Writer.write(Output)) + exitWithError(std::move(E)); + + return 0; +} + +static int show_main(int argc, const char *argv[]) { + if (Filename == OutputFilename) { + errs() << sys::path::filename(argv[0]) << " " << argv[1] + << ": Input file name cannot be the same as the output file name!\n"; + return 1; + } + + std::error_code EC; + raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF); + if (EC) + exitWithErrorCode(EC, OutputFilename); + + auto FS = vfs::getRealFileSystem(); + auto ReaderOrErr = CodeGenDataReader::create(Filename, *FS); + if (Error E = ReaderOrErr.takeError()) + exitWithError(std::move(E), Filename); + + auto Reader = ReaderOrErr->get(); + if (ShowCGDataVersion) + OS << "Version: " << Reader->getVersion() << "\n"; + + if (Reader->hasOutlinedHashTree()) { + auto Tree = Reader->releaseOutlinedHashTree(); + OS << "Outlined 
hash tree:\n"; + OS << " Total Node Count: " << Tree->size() << "\n"; + OS << " Terminal Node Count: " << Tree->size(/*GetTerminalCountOnly=*/true) + << "\n"; + OS << " Depth: " << Tree->depth() << "\n"; + } + + return 0; +} + +int llvm_cgdata_main(int argc, char **argvNonConst, const llvm::ToolContext &) { + const char **argv = const_cast(argvNonConst); + + StringRef ProgName(sys::path::filename(argv[0])); + + if (argc < 2) { + errs() << ProgName + << ": No subcommand specified! Run llvm-cgdata --help for usage.\n"; + return 1; + } + + cl::ParseCommandLineOptions(argc, argv, "LLVM codegen data\n"); + + if (DumpSubcommand) + return dump_main(argc, argv); + + if (MergeSubcommand) + return merge_main(argc, argv); + + if (ShowSubcommand) + return show_main(argc, argv); + + errs() << ProgName + << ": Unknown command. Run llvm-cgdata --help for usage.\n"; + return 1; +}