From 2e7a9099303b227ed8322996f3ba347d8eaebef5 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 19 Sep 2024 12:45:52 -0400 Subject: [PATCH] Transition to JITLink Seeing what this will look like, since it has a number of features (delayed compilation, concurrent compilation) that are starting to become important, so it would be nice to switch to only supporting one common implementation of memory management. Refs #50248 I am expecting https://github.com/llvm/llvm-project/issues/63236 may cause problems, since we reconfigured some CI machines to hide that issue, but it is still likely relevant. --- src/Makefile | 2 +- src/cgmemmgr.cpp | 965 ---------------------------------------------- src/codegen.cpp | 14 +- src/jitlayers.cpp | 400 ++++--------------- src/jitlayers.h | 64 +-- 5 files changed, 77 insertions(+), 1368 deletions(-) delete mode 100644 src/cgmemmgr.cpp diff --git a/src/Makefile b/src/Makefile index a6b1f433b73ce..f25d37bb13f60 100644 --- a/src/Makefile +++ b/src/Makefile @@ -55,7 +55,7 @@ ifeq ($(JULIACODEGEN),LLVM) CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop llvm-muladd \ llvm-final-gc-lowering llvm-pass-helpers llvm-late-gc-lowering llvm-ptls \ llvm-lower-handlers llvm-gc-invariant-verifier llvm-propagate-addrspaces \ - llvm-multiversioning llvm-alloc-opt llvm-alloc-helpers cgmemmgr llvm-remove-addrspaces \ + llvm-multiversioning llvm-alloc-opt llvm-alloc-helpers llvm-remove-addrspaces \ llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures pipeline llvm_api FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir) CG_LLVM_LIBS := all diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp deleted file mode 100644 index c78e6092ca5db..0000000000000 --- a/src/cgmemmgr.cpp +++ /dev/null @@ -1,965 +0,0 @@ -// This file is a part of Julia. License is MIT: https://julialang.org/license - -#include "llvm-version.h" -#include "platform.h" - -#include -#include "julia.h" -#include "julia_internal.h" - -#ifdef _OS_LINUX_ -# include -# include -# include -#endif -#ifndef _OS_WINDOWS_ -# include -# include -# include -# include -# if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS) -# define MAP_ANONYMOUS MAP_ANON -# endif -#endif -#ifdef _OS_FREEBSD_ -# include -# include -#endif -#ifdef _OS_OPENBSD_ -# include -#endif -#include "julia_assert.h" - -namespace { - -static size_t get_block_size(size_t size) -{ - return (size > jl_page_size * 256 ? LLT_ALIGN(size, jl_page_size) : - jl_page_size * 256); -} - -// Wrapper function to mmap/munmap/mprotect pages... -static void *map_anon_page(size_t size) -{ -#ifdef _OS_WINDOWS_ - char *mem = (char*)VirtualAlloc(NULL, size + jl_page_size, - MEM_COMMIT, PAGE_READWRITE); - assert(mem && "Cannot allocate RW memory"); - mem = (char*)LLT_ALIGN(uintptr_t(mem), jl_page_size); -#else // _OS_WINDOWS_ - void *mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - assert(mem != MAP_FAILED && "Cannot allocate RW memory"); -#endif // _OS_WINDOWS_ - return mem; -} - -static void unmap_page(void *ptr, size_t size) -{ -#ifdef _OS_WINDOWS_ - VirtualFree(ptr, size, MEM_DECOMMIT); -#else // _OS_WINDOWS_ - munmap(ptr, size); -#endif // _OS_WINDOWS_ -} - -#ifdef _OS_WINDOWS_ -enum class Prot : int { - RW = PAGE_READWRITE, - RX = PAGE_EXECUTE, - RO = PAGE_READONLY, - NO = PAGE_NOACCESS -}; - -static void protect_page(void *ptr, size_t size, Prot flags) -{ - DWORD old_prot; - if (!VirtualProtect(ptr, size, (DWORD)flags, &old_prot)) { - jl_safe_printf("Cannot protect page @%p of size %u to 0x%x (err 0x%x)\n", - ptr, (unsigned)size, (unsigned)flags, - (unsigned)GetLastError()); - abort(); - } -} -#else // _OS_WINDOWS_ -enum class Prot : int { - RW = PROT_READ | PROT_WRITE, - RX = PROT_READ | PROT_EXEC, - RO = PROT_READ, - NO = PROT_NONE -}; - -static void protect_page(void *ptr, size_t size, Prot flags) -{ - int ret = mprotect(ptr, size, (int)flags); - if (ret != 0) { - perror(__func__); - abort(); - } -} - -static bool check_fd_or_close(int fd) -{ - if (fd == -1) - return false; - int err = fcntl(fd, F_SETFD, FD_CLOEXEC); - assert(err == 0); - (void)err; // prevent compiler warning - if (fchmod(fd, S_IRWXU) != 0 || - ftruncate(fd, jl_page_size) != 0) { - close(fd); - return false; - } - // This can fail due to `noexec` mount option .... - void *ptr = mmap(nullptr, jl_page_size, PROT_READ | PROT_EXEC, - MAP_SHARED, fd, 0); - if (ptr == MAP_FAILED) { - close(fd); - return false; - } - munmap(ptr, jl_page_size); - return true; -} -#endif // _OS_WINDOWS_ - -static intptr_t anon_hdl = -1; - -#ifdef _OS_WINDOWS_ -// As far as I can tell `CreateFileMapping` cannot be resized on windows. -// Also, creating big file mapping and then map pieces of it seems to -// consume too much global resources. Therefore, we use each file mapping -// as a block on windows -static void *create_shared_map(size_t size, size_t id) -{ - void *addr = MapViewOfFile((HANDLE)id, FILE_MAP_ALL_ACCESS, - 0, 0, size); - assert(addr && "Cannot map RW view"); - return addr; -} - -static intptr_t init_shared_map() -{ - anon_hdl = 0; - return 0; -} - -static void *alloc_shared_page(size_t size, size_t *id, bool exec) -{ - assert(size % jl_page_size == 0); - DWORD file_mode = exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE; - HANDLE hdl = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, - file_mode, 0, size, NULL); - *id = (size_t)hdl; - // We set the maximum permissions for this to the maximum for this file, and then - // VirtualProtect, such that the debugger can still access these - // pages and set breakpoints if it wants to. - DWORD map_mode = FILE_MAP_ALL_ACCESS | (exec ? FILE_MAP_EXECUTE : 0); - void *addr = MapViewOfFile(hdl, map_mode, 0, 0, size); - assert(addr && "Cannot map RO view"); - DWORD protect_mode = exec ? PAGE_EXECUTE_READ : PAGE_READONLY; - VirtualProtect(addr, size, protect_mode, &file_mode); - return addr; -} -#else // _OS_WINDOWS_ -// For shared mapped region -static intptr_t get_anon_hdl(void) -{ - int fd = -1; - - // Linux and FreeBSD can create an anonymous fd without touching the - // file system. -# ifdef __NR_memfd_create - fd = syscall(__NR_memfd_create, "julia-codegen", 0); - if (check_fd_or_close(fd)) - return fd; -# endif -# ifdef _OS_FREEBSD_ - fd = shm_open(SHM_ANON, O_RDWR, S_IRWXU); - if (check_fd_or_close(fd)) - return fd; -# endif - char shm_name[JL_PATH_MAX] = "julia-codegen-0123456789-0123456789/tmp///"; - pid_t pid = getpid(); - // `shm_open` can't be mapped exec on mac -# ifndef _OS_DARWIN_ - do { - snprintf(shm_name, sizeof(shm_name), - "julia-codegen-%d-%d", (int)pid, rand()); - fd = shm_open(shm_name, O_RDWR | O_CREAT | O_EXCL, S_IRWXU); - if (check_fd_or_close(fd)) { - shm_unlink(shm_name); - return fd; - } - } while (errno == EEXIST); -# endif - FILE *tmpf = tmpfile(); - if (tmpf) { - fd = dup(fileno(tmpf)); - fclose(tmpf); - if (check_fd_or_close(fd)) { - return fd; - } - } - size_t len = sizeof(shm_name); - if (uv_os_tmpdir(shm_name, &len) != 0) { - // Unknown error; default to `/tmp` - snprintf(shm_name, sizeof(shm_name), "/tmp"); - len = 4; - } - snprintf(shm_name + len, sizeof(shm_name) - len, - "/julia-codegen-%d-XXXXXX", (int)pid); - fd = mkstemp(shm_name); - if (check_fd_or_close(fd)) { - unlink(shm_name); - return fd; - } - return -1; -} - -static _Atomic(size_t) map_offset{0}; -// Multiple of 128MB. -// Hopefully no one will set a ulimit for this to be a problem... -static constexpr size_t map_size_inc_default = 128 * 1024 * 1024; -static size_t map_size = 0; -static struct _make_shared_map_lock { - uv_mutex_t mtx; - _make_shared_map_lock() { - uv_mutex_init(&mtx); - }; -} shared_map_lock; - -static size_t get_map_size_inc() -{ - rlimit rl; - if (getrlimit(RLIMIT_FSIZE, &rl) != -1) { - if (rl.rlim_cur != RLIM_INFINITY) { - return std::min(map_size_inc_default, rl.rlim_cur); - } - if (rl.rlim_max != RLIM_INFINITY) { - return std::min(map_size_inc_default, rl.rlim_max); - } - } - return map_size_inc_default; -} - -static void *create_shared_map(size_t size, size_t id) -{ - void *addr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, - anon_hdl, id); - assert(addr != MAP_FAILED && "Cannot map RW view"); - return addr; -} - -static intptr_t init_shared_map() -{ - anon_hdl = get_anon_hdl(); - if (anon_hdl == -1) - return -1; - jl_atomic_store_relaxed(&map_offset, 0); - map_size = get_map_size_inc(); - int ret = ftruncate(anon_hdl, map_size); - if (ret != 0) { - perror(__func__); - abort(); - } - return anon_hdl; -} - -static void *alloc_shared_page(size_t size, size_t *id, bool exec) -{ - assert(size % jl_page_size == 0); - size_t off = jl_atomic_fetch_add(&map_offset, size); - *id = off; - size_t map_size_inc = get_map_size_inc(); - if (__unlikely(off + size > map_size)) { - uv_mutex_lock(&shared_map_lock.mtx); - size_t old_size = map_size; - while (off + size > map_size) - map_size += map_size_inc; - if (old_size != map_size) { - int ret = ftruncate(anon_hdl, map_size); - if (ret != 0) { - perror(__func__); - abort(); - } - } - uv_mutex_unlock(&shared_map_lock.mtx); - } - return create_shared_map(size, off); -} -#endif // _OS_WINDOWS_ - -#ifdef _OS_LINUX_ -// Using `/proc/self/mem`, A.K.A. Keno's remote memory manager. - -ssize_t pwrite_addr(int fd, const void *buf, size_t nbyte, uintptr_t addr) -{ - static_assert(sizeof(off_t) >= 8, "off_t is smaller than 64bits"); -#ifdef _P64 - const uintptr_t sign_bit = uintptr_t(1) << 63; - if (__unlikely(sign_bit & addr)) { - // This case should not happen with default kernel on 64bit since the address belongs - // to kernel space (linear mapping). - // However, it seems possible to change this at kernel compile time. - - // pwrite doesn't support offset with sign bit set but lseek does. - // This is obviously not thread-safe but none of the mem manager does anyway... - // From the kernel code, `lseek` with `SEEK_SET` can't fail. - // However, this can possibly confuse the glibc wrapper to think that - // we have invalid input value. Use syscall directly to be sure. - syscall(SYS_lseek, (long)fd, addr, (long)SEEK_SET); - // The return value can be -1 when the glibc syscall function - // think we have an error return with and `addr` that's too large. - // Ignore the return value for now. - return write(fd, buf, nbyte); - } -#endif - return pwrite(fd, buf, nbyte, (off_t)addr); -} - -// Do not call this directly. -// Use `get_self_mem_fd` which has a guard to call this only once. -static int _init_self_mem() -{ - struct utsname kernel; - uname(&kernel); - int major, minor; - if (-1 == sscanf(kernel.release, "%d.%d", &major, &minor)) - return -1; - // Can't risk getting a memory block backed by transparent huge pages, - // which cause the kernel to freeze on systems that have the DirtyCOW - // mitigation patch, but are < 4.10. - if (!(major > 4 || (major == 4 && minor >= 10))) - return -1; -#ifdef O_CLOEXEC - int fd = open("/proc/self/mem", O_RDWR | O_SYNC | O_CLOEXEC); - if (fd == -1) - return -1; -#else - int fd = open("/proc/self/mem", O_RDWR | O_SYNC); - if (fd == -1) - return -1; - fcntl(fd, F_SETFD, FD_CLOEXEC); -#endif - - // Check if we can write to a RX page - void *test_pg = mmap(nullptr, jl_page_size, PROT_READ | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - // We can ignore this though failure to allocate executable memory would be a bigger problem. - assert(test_pg != MAP_FAILED && "Cannot allocate executable memory"); - - const uint64_t v = 0xffff000012345678u; - int ret = pwrite_addr(fd, (const void*)&v, sizeof(uint64_t), (uintptr_t)test_pg); - if (ret != sizeof(uint64_t) || *(volatile uint64_t*)test_pg != v) { - munmap(test_pg, jl_page_size); - close(fd); - return -1; - } - munmap(test_pg, jl_page_size); - return fd; -} - -static int get_self_mem_fd() -{ - static int fd = _init_self_mem(); - return fd; -} - -static void write_self_mem(void *dest, void *ptr, size_t size) -{ - while (size > 0) { - ssize_t ret = pwrite_addr(get_self_mem_fd(), ptr, size, (uintptr_t)dest); - if ((size_t)ret == size) - return; - if (ret == -1 && (errno == EAGAIN || errno == EINTR)) - continue; - assert((size_t)ret < size); - size -= ret; - ptr = (char*)ptr + ret; - dest = (char*)dest + ret; - } -} -#endif // _OS_LINUX_ - -using namespace llvm; - -// Allocation strategies -// * For RW data, no memory protection needed, use plain memory pool. -// * For RO data or code, -// -// The first allocation in the page always has write address equals to -// runtime address. -// -// 1. shared dual map -// -// Map an (unlinked) anonymous file as memory pool. -// After first allocation, write address points to the second map. -// The second map is set to unreadable and unwritable in finalization. -// -// 2. private dual map -// -// Same as above but use anonymous memory map as memory pool, -// and use low level OS api to set up the second map. -// -// 3. copying data into RO page bypassing page protection -// -// After first allocation, write address points to a temporary buffer. -// Requires copying data out of the temporary buffer in finalization. - -// Allocates at least 256 pages per block and keep up to 8 blocks in the free -// list. The block with the least free space is discarded when we need to -// allocate a new page. -// Unused full pages are free'd from the block before discarding so at most -// one page is wasted on each discarded blocks. There should be at most one -// block with more than 128 pages available so the discarded one must have -// less than 128 pages available and therefore at least 128 pages used. -// (Apart from fragmentation) this guarantees less than 1% of memory is wasted. - -// the `shared` type parameter is for Windows only.... -struct Block { - // runtime address - char *ptr{nullptr}; - size_t total{0}; - size_t avail{0}; - - Block(const Block&) = delete; - Block &operator=(const Block&) = delete; - Block(Block &&other) - : ptr(other.ptr), - total(other.total), - avail(other.avail) - { - other.ptr = nullptr; - other.total = other.avail = 0; - } - - Block() = default; - - void *alloc(size_t size, size_t align) - { - size_t aligned_avail = avail & (-align); - if (aligned_avail < size) - return nullptr; - char *p = ptr + total - aligned_avail; - avail = aligned_avail - size; - return p; - } - void reset(void *addr, size_t size) - { - if (avail >= jl_page_size) { - uintptr_t end = uintptr_t(ptr) + total; - uintptr_t first_free = end - avail; - first_free = LLT_ALIGN(first_free, jl_page_size); - assert(first_free < end); - unmap_page((void*)first_free, end - first_free); - } - ptr = (char*)addr; - total = avail = size; - } -}; - -class RWAllocator { - static constexpr int nblocks = 8; - Block blocks[nblocks]{}; -public: - void *alloc(size_t size, size_t align) - { - size_t min_size = (size_t)-1; - int min_id = 0; - for (int i = 0;i < nblocks && blocks[i].ptr;i++) { - if (void *ptr = blocks[i].alloc(size, align)) - return ptr; - if (blocks[i].avail < min_size) { - min_size = blocks[i].avail; - min_id = i; - } - } - size_t block_size = get_block_size(size); - blocks[min_id].reset(map_anon_page(block_size), block_size); - return blocks[min_id].alloc(size, align); - } -}; - -struct SplitPtrBlock : public Block { - // Possible states - // Allocation: - // * Initial allocation: `state & InitAlloc` - // * Followup allocation: `(state & Alloc) && !(state & InitAlloc)` - enum State { - // This block has no page protection set yet - InitAlloc = (1 << 0), - // There is at least one allocation in this page since last finalization - Alloc = (1 << 1), - // `wr_ptr` can be directly used as write address. - WRInit = (1 << 2), - // With `WRInit` set, whether `wr_ptr` has write permission enabled. - WRReady = (1 << 3), - }; - - uintptr_t wr_ptr{0}; - uint32_t state{0}; - SplitPtrBlock() = default; - - void swap(SplitPtrBlock &other) - { - std::swap(ptr, other.ptr); - std::swap(total, other.total); - std::swap(avail, other.avail); - std::swap(wr_ptr, other.wr_ptr); - std::swap(state, other.state); - } - - SplitPtrBlock(SplitPtrBlock &&other) - : SplitPtrBlock() - { - swap(other); - } -}; - -struct Allocation { - // Address to write to (the one returned by the allocation function) - void *wr_addr; - // Runtime address - void *rt_addr; - size_t sz; - bool relocated; -}; - -template -class ROAllocator { -protected: - static constexpr int nblocks = 8; - SplitPtrBlock blocks[nblocks]; - // Blocks that are done allocating (removed from `blocks`) - // but might not have all the permissions set or data copied yet. - SmallVector completed; - virtual void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, - size_t size, size_t align) = 0; - virtual SplitPtrBlock alloc_block(size_t size) = 0; -public: - virtual ~ROAllocator() {} - virtual void finalize() - { - for (auto &alloc: allocations) { - // ensure the mapped pages are consistent - sys::Memory::InvalidateInstructionCache(alloc.wr_addr, - alloc.sz); - sys::Memory::InvalidateInstructionCache(alloc.rt_addr, - alloc.sz); - } - completed.clear(); - allocations.clear(); - } - // Allocations that have not been finalized yet. - SmallVector allocations; - void *alloc(size_t size, size_t align) - { - size_t min_size = (size_t)-1; - int min_id = 0; - for (int i = 0;i < nblocks && blocks[i].ptr;i++) { - auto &block = blocks[i]; - void *ptr = block.alloc(size, align); - if (ptr) { - void *wr_ptr; - if (block.state & SplitPtrBlock::InitAlloc) { - wr_ptr = ptr; - } - else { - wr_ptr = get_wr_ptr(block, ptr, size, align); - } - block.state |= SplitPtrBlock::Alloc; - allocations.push_back(Allocation{wr_ptr, ptr, size, false}); - return wr_ptr; - } - if (block.avail < min_size) { - min_size = block.avail; - min_id = i; - } - } - size_t block_size = get_block_size(size); - auto &block = blocks[min_id]; - auto new_block = alloc_block(block_size); - block.swap(new_block); - if (new_block.state) { - completed.push_back(std::move(new_block)); - } - else { - new_block.reset(nullptr, 0); - } - void *ptr = block.alloc(size, align); -#ifdef _OS_WINDOWS_ - block.state = SplitPtrBlock::Alloc; - void *wr_ptr = get_wr_ptr(block, ptr, size, align); - allocations.push_back(Allocation{wr_ptr, ptr, size, false}); - ptr = wr_ptr; -#else - block.state = SplitPtrBlock::Alloc | SplitPtrBlock::InitAlloc; - allocations.push_back(Allocation{ptr, ptr, size, false}); -#endif - return ptr; - } -}; - -template -class DualMapAllocator : public ROAllocator { -protected: - void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, size_t, size_t) override - { - assert((char*)rt_ptr >= block.ptr && - (char*)rt_ptr < (block.ptr + block.total)); - if (!(block.state & SplitPtrBlock::WRInit)) { - block.wr_ptr = (uintptr_t)create_shared_map(block.total, - block.wr_ptr); - block.state |= SplitPtrBlock::WRInit; - } - if (!(block.state & SplitPtrBlock::WRReady)) { - protect_page((void*)block.wr_ptr, block.total, Prot::RW); - block.state |= SplitPtrBlock::WRReady; - } - return (char*)rt_ptr + (block.wr_ptr - uintptr_t(block.ptr)); - } - SplitPtrBlock alloc_block(size_t size) override - { - SplitPtrBlock new_block; - // use `wr_ptr` to record the id initially - auto ptr = alloc_shared_page(size, (size_t*)&new_block.wr_ptr, exec); - new_block.reset(ptr, size); - return new_block; - } - void finalize_block(SplitPtrBlock &block, bool reset) - { - // This function handles setting the block to the right mode - // and free'ing maps that are not needed anymore. - // If `reset` is `true`, we won't allocate in this block anymore and - // we should free up resources that is not needed at runtime. - if (!(block.state & SplitPtrBlock::Alloc)) { - // A block that is not used this time, check if we need to free it. - if ((block.state & SplitPtrBlock::WRInit) && reset) - unmap_page((void*)block.wr_ptr, block.total); - return; - } - // For a block we used this time - if (block.state & SplitPtrBlock::InitAlloc) { - // For an initial block, we have a single RW map. - // Need to map it to RO or RX. - assert(!(block.state & (SplitPtrBlock::WRReady | - SplitPtrBlock::WRInit))); - protect_page(block.ptr, block.total, exec ? Prot::RX : Prot::RO); - block.state = 0; - } - else { - // For other ones, the runtime address has the correct mode. - // Need to map the write address to RO. - assert(block.state & SplitPtrBlock::WRInit); - assert(block.state & SplitPtrBlock::WRReady); - if (reset) { - unmap_page((void*)block.wr_ptr, block.total); - } - else { - protect_page((void*)block.wr_ptr, block.total, Prot::NO); - block.state = SplitPtrBlock::WRInit; - } - } - } -public: - DualMapAllocator() - { - assert(anon_hdl != -1); - } - void finalize() override - { - for (auto &block : this->blocks) { - finalize_block(block, false); - } - for (auto &block : this->completed) { - finalize_block(block, true); - block.reset(nullptr, 0); - } - ROAllocator::finalize(); - } -}; - -#ifdef _OS_LINUX_ -template -class SelfMemAllocator : public ROAllocator { - SmallVector temp_buff; -protected: - void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, - size_t size, size_t align) override - { - assert(!(block.state & SplitPtrBlock::InitAlloc)); - for (auto &wr_block: temp_buff) { - if (void *ptr = wr_block.alloc(size, align)) { - return ptr; - } - } - temp_buff.emplace_back(); - Block &new_block = temp_buff.back(); - size_t block_size = get_block_size(size); - new_block.reset(map_anon_page(block_size), block_size); - return new_block.alloc(size, align); - } - SplitPtrBlock alloc_block(size_t size) override - { - SplitPtrBlock new_block; - new_block.reset(map_anon_page(size), size); - return new_block; - } - void finalize_block(SplitPtrBlock &block, bool reset) - { - if (!(block.state & SplitPtrBlock::Alloc)) - return; - if (block.state & SplitPtrBlock::InitAlloc) { - // for an initial block, we need to map it to ro or rx - assert(!(block.state & (SplitPtrBlock::WRReady | - SplitPtrBlock::WRInit))); - protect_page(block.ptr, block.total, exec ? Prot::RX : Prot::RO); - block.state = 0; - } - } -public: - SelfMemAllocator() - : ROAllocator(), - temp_buff() - { - assert(get_self_mem_fd() != -1); - } - void finalize() override - { - for (auto &block : this->blocks) { - finalize_block(block, false); - } - for (auto &block : this->completed) { - finalize_block(block, true); - block.reset(nullptr, 0); - } - for (auto &alloc : this->allocations) { - if (alloc.rt_addr == alloc.wr_addr) - continue; - write_self_mem(alloc.rt_addr, alloc.wr_addr, alloc.sz); - } - // clear all the temp buffers except the first one - // (we expect only one) - bool cached = false; - for (auto &block : temp_buff) { - if (cached) { - munmap(block.ptr, block.total); - block.ptr = nullptr; - block.total = block.avail = 0; - } - else { - block.avail = block.total; - cached = true; - } - } - if (cached) - temp_buff.resize(1); - ROAllocator::finalize(); - } -}; -#endif // _OS_LINUX_ - -class RTDyldMemoryManagerJL : public SectionMemoryManager { - struct EHFrame { - uint8_t *addr; - size_t size; - }; - RTDyldMemoryManagerJL(const RTDyldMemoryManagerJL&) = delete; - void operator=(const RTDyldMemoryManagerJL&) = delete; - SmallVector pending_eh; - RWAllocator rw_alloc; - std::unique_ptr> ro_alloc; - std::unique_ptr> exe_alloc; - bool code_allocated; - size_t total_allocated; - -public: - RTDyldMemoryManagerJL() - : SectionMemoryManager(), - pending_eh(), - rw_alloc(), - ro_alloc(), - exe_alloc(), - code_allocated(false), - total_allocated(0) - { -#ifdef _OS_LINUX_ - if (!ro_alloc && get_self_mem_fd() != -1) { - ro_alloc.reset(new SelfMemAllocator()); - exe_alloc.reset(new SelfMemAllocator()); - } -#endif - if (!ro_alloc && init_shared_map() != -1) { - ro_alloc.reset(new DualMapAllocator()); - exe_alloc.reset(new DualMapAllocator()); - } - } - ~RTDyldMemoryManagerJL() override - { - } - size_t getTotalBytes() { return total_allocated; } - void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) override; -#if 0 - // Disable for now since we are not actually using this. - void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) override; -#endif - uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, - StringRef SectionName) override; - uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, StringRef SectionName, - bool isReadOnly) override; - using SectionMemoryManager::notifyObjectLoaded; - void notifyObjectLoaded(RuntimeDyld &Dyld, - const object::ObjectFile &Obj) override; - bool finalizeMemory(std::string *ErrMsg = nullptr) override; - template - void mapAddresses(DL &Dyld, Alloc &&allocator) - { - for (auto &alloc: allocator->allocations) { - if (alloc.rt_addr == alloc.wr_addr || alloc.relocated) - continue; - alloc.relocated = true; - Dyld.mapSectionAddress(alloc.wr_addr, (uintptr_t)alloc.rt_addr); - } - } - template - void mapAddresses(DL &Dyld) - { - if (!ro_alloc) - return; - mapAddresses(Dyld, ro_alloc); - mapAddresses(Dyld, exe_alloc); - } -#ifdef _OS_WINDOWS_ - template - void *lookupWriteAddressFor(void *rt_addr, Alloc &&allocator) - { - for (auto &alloc: allocator->allocations) { - if (alloc.rt_addr == rt_addr) { - return alloc.wr_addr; - } - } - return nullptr; - } - void *lookupWriteAddressFor(void *rt_addr) - { - if (!ro_alloc) - return rt_addr; - if (void *ptr = lookupWriteAddressFor(rt_addr, ro_alloc)) - return ptr; - if (void *ptr = lookupWriteAddressFor(rt_addr, exe_alloc)) - return ptr; - return rt_addr; - } -#endif // _OS_WINDOWS_ -}; - -uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size, - unsigned Alignment, - unsigned SectionID, - StringRef SectionName) -{ - // allocating more than one code section can confuse libunwind. -#if !defined(_COMPILER_MSAN_ENABLED_) && !defined(_COMPILER_ASAN_ENABLED_) - // TODO: Figure out why msan and now asan too need this. - assert(!code_allocated); - code_allocated = true; -#endif - total_allocated += Size; - jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); - jl_timing_counter_inc(JL_TIMING_COUNTER_JITCodeSize, Size); - if (exe_alloc) - return (uint8_t*)exe_alloc->alloc(Size, Alignment); - return SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, - SectionName); -} - -uint8_t *RTDyldMemoryManagerJL::allocateDataSection(uintptr_t Size, - unsigned Alignment, - unsigned SectionID, - StringRef SectionName, - bool isReadOnly) -{ - total_allocated += Size; - jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); - jl_timing_counter_inc(JL_TIMING_COUNTER_JITDataSize, Size); - if (!isReadOnly) - return (uint8_t*)rw_alloc.alloc(Size, Alignment); - if (ro_alloc) - return (uint8_t*)ro_alloc->alloc(Size, Alignment); - return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, - SectionName, isReadOnly); -} - -void RTDyldMemoryManagerJL::notifyObjectLoaded(RuntimeDyld &Dyld, - const object::ObjectFile &Obj) -{ - if (!ro_alloc) { - assert(!exe_alloc); - SectionMemoryManager::notifyObjectLoaded(Dyld, Obj); - return; - } - assert(exe_alloc); - mapAddresses(Dyld); -} - -bool RTDyldMemoryManagerJL::finalizeMemory(std::string *ErrMsg) -{ - code_allocated = false; - if (ro_alloc) { - ro_alloc->finalize(); - assert(exe_alloc); - exe_alloc->finalize(); - for (auto &frame: pending_eh) - register_eh_frames(frame.addr, frame.size); - pending_eh.clear(); - return false; - } - else { - assert(!exe_alloc); - return SectionMemoryManager::finalizeMemory(ErrMsg); - } -} - -void RTDyldMemoryManagerJL::registerEHFrames(uint8_t *Addr, - uint64_t LoadAddr, - size_t Size) -{ - if (uintptr_t(Addr) == LoadAddr) { - register_eh_frames(Addr, Size); - } - else { - pending_eh.push_back(EHFrame{(uint8_t*)(uintptr_t)LoadAddr, Size}); - } -} - -#if 0 -void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr, - uint64_t LoadAddr, - size_t Size) -{ - deregister_eh_frames((uint8_t*)LoadAddr, Size); -} -#endif - -} - -#ifdef _OS_WINDOWS_ -void *lookupWriteAddressFor(RTDyldMemoryManager *memmgr, void *rt_addr) -{ - return ((RTDyldMemoryManagerJL*)memmgr)->lookupWriteAddressFor(rt_addr); -} -#endif - -RTDyldMemoryManager* createRTDyldMemoryManager() -{ - return new RTDyldMemoryManagerJL(); -} - -size_t getRTDyldMemoryManagerTotalBytes(RTDyldMemoryManager *mm) -{ - return ((RTDyldMemoryManagerJL*)mm)->getTotalBytes(); -} diff --git a/src/codegen.cpp b/src/codegen.cpp index a7a985284c87b..cee88a3d20f3d 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -10396,7 +10396,7 @@ static void init_jit_functions(void) } #ifdef JL_USE_INTEL_JITEVENTS -char jl_using_intel_jitevents; // Non-zero if running under Intel VTune Amplifier +char jl_using_intel_jitevents = 0; // Non-zero if running under Intel VTune Amplifier #endif #ifdef JL_USE_OPROFILE_JITEVENTS @@ -10510,9 +10510,6 @@ extern "C" void jl_init_llvm(void) #if defined(JL_USE_INTEL_JITEVENTS) || \ defined(JL_USE_OPROFILE_JITEVENTS) || \ defined(JL_USE_PERF_JITEVENTS) -#ifdef JL_USE_JITLINK -#pragma message("JIT profiling support (JL_USE_*_JITEVENTS) not yet available on platforms that use JITLink") -#else const char *jit_profiling = getenv("ENABLE_JITPROFILING"); #if defined(JL_USE_INTEL_JITEVENTS) @@ -10529,24 +10526,23 @@ extern "C" void jl_init_llvm(void) #if defined(JL_USE_PERF_JITEVENTS) if (jit_profiling && atoi(jit_profiling)) { - jl_using_perf_jitevents= 1; + jl_using_perf_jitevents = 1; } #endif #ifdef JL_USE_INTEL_JITEVENTS if (jl_using_intel_jitevents) - jl_ExecutionEngine->RegisterJITEventListener(JITEventListener::createIntelJITEventListener()); + jl_ExecutionEngine->enableIntelJITEventListener(); #endif #ifdef JL_USE_OPROFILE_JITEVENTS if (jl_using_oprofile_jitevents) - jl_ExecutionEngine->RegisterJITEventListener(JITEventListener::createOProfileJITEventListener()); + jl_ExecutionEngine->enableOProfileJITEventListener(); #endif #ifdef JL_USE_PERF_JITEVENTS if (jl_using_perf_jitevents) - jl_ExecutionEngine->RegisterJITEventListener(JITEventListener::createPerfJITEventListener()); -#endif + jl_ExecutionEngine->enablePerfJITEventListener(); #endif #endif diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 442103c91be0f..a70988aa36d89 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -14,6 +14,15 @@ #include #include #include +#if JL_LLVM_VERSION >= 180000 +#include +#include +#include +#endif +#if JL_LLVM_VERSION >= 190000 +#include +#include +#endif #include #include #include @@ -142,11 +151,11 @@ void jl_dump_llvm_opt_impl(void *s) **jl_ExecutionEngine->get_dump_llvm_opt_stream() = (ios_t*)s; } -static int jl_add_to_ee( - orc::ThreadSafeModule &M, - const StringMap &NewExports, - DenseMap &Queued, - SmallVectorImpl &Stack) JL_NOTSAFEPOINT; +//static int jl_add_to_ee( +// orc::ThreadSafeModule &M, +// const StringMap &NewExports, +// DenseMap &Queued, +// SmallVectorImpl &Stack) JL_NOTSAFEPOINT; static void jl_decorate_module(Module &M) JL_NOTSAFEPOINT; static uint64_t getAddressForFunction(StringRef fname) JL_NOTSAFEPOINT; @@ -239,7 +248,8 @@ static jl_callptr_t _jl_compile_codeinst( if (params.imaging_mode) { // Won't contain any PLT/dlsym calls, so no need to optimize those jl_ExecutionEngine->addModule(jl_get_globals_module(params.tsctx, params.DL, params.TargetTriple, params.global_targets)); - } else { + } + else { StringMap NewGlobals; for (auto &global : params.global_targets) { NewGlobals[global.second->getName()] = global.first; @@ -255,31 +265,10 @@ static jl_callptr_t _jl_compile_codeinst( } } - // Collect the exported functions from the params.compiled_functions modules, - // which form dependencies on which functions need to be - // compiled first. Cycles of functions are compiled together. - // (essentially we compile a DAG of SCCs in reverse topological order, - // if we treat declarations of external functions as edges from declaration - // to definition) - StringMap NewExports; - for (auto &def : params.compiled_functions) { - orc::ThreadSafeModule &TSM = std::get<0>(def.second); - //The underlying context object is still locked because params is not destroyed yet - auto M = TSM.getModuleUnlocked(); - jl_ExecutionEngine->optimizeDLSyms(*M); - for (auto &F : M->global_objects()) { - if (!F.isDeclaration() && F.getLinkage() == GlobalValue::ExternalLinkage) { - NewExports[F.getName()] = &TSM; - } - } - } - DenseMap Queued; - SmallVector Stack; for (auto &def : params.compiled_functions) { // Add the results to the execution engine now orc::ThreadSafeModule &M = std::get<0>(def.second); - jl_add_to_ee(M, NewExports, Queued, Stack); - assert(Queued.empty() && Stack.empty() && !M); + jl_ExecutionEngine->addModule(std::move(M)); } ++CompiledCodeinsts; MaxWorkqueueSize.updateMax(params.compiled_functions.size()); @@ -601,48 +590,6 @@ static auto countBasicBlocks(const Function &F) JL_NOTSAFEPOINT static constexpr size_t N_optlevels = 4; -static Expected validateExternRelocations(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) JL_NOTSAFEPOINT { -#if !defined(JL_NDEBUG) && !defined(JL_USE_JITLINK) - auto isIntrinsicFunction = [](GlobalObject &GO) JL_NOTSAFEPOINT { - auto F = dyn_cast(&GO); - if (!F) - return false; - return F->isIntrinsic() || F->getName().starts_with("julia."); - }; - // validate the relocations for M (only for RuntimeDyld, JITLink performs its own symbol validation) - auto Err = TSM.withModuleDo([isIntrinsicFunction](Module &M) JL_NOTSAFEPOINT { - Error Err = Error::success(); - for (auto &GO : make_early_inc_range(M.global_objects())) { - if (!GO.isDeclarationForLinker()) - continue; - if (GO.use_empty()) { - GO.eraseFromParent(); - continue; - } - if (isIntrinsicFunction(GO)) - continue; - auto sym = jl_ExecutionEngine->findUnmangledSymbol(GO.getName()); - if (sym) - continue; - // TODO have we ever run into this check? It's been guaranteed to not - // fire in an assert build, since previously LLVM would abort due to - // not handling the error if we didn't find the unmangled symbol - if (SectionMemoryManager::getSymbolAddressInProcess( - jl_ExecutionEngine->getMangledName(GO.getName()))) { - consumeError(sym.takeError()); - continue; - } - Err = joinErrors(std::move(Err), sym.takeError()); - } - return Err; - }); - if (Err) { - return std::move(Err); - } -#endif - return std::move(TSM); -} - static Expected selectOptLevel(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) { TSM.withModuleDo([](Module &M) { size_t opt_level = std::max(static_cast(jl_options.opt_level), 0); @@ -673,18 +620,6 @@ static Expected selectOptLevel(orc::ThreadSafeModule TSM, return std::move(TSM); } -static void recordDebugTSM(orc::MaterializationResponsibility &, orc::ThreadSafeModule TSM) JL_NOTSAFEPOINT { - auto ptr = TSM.withModuleDo([](Module &M) JL_NOTSAFEPOINT { - auto md = M.getModuleFlag("julia.__jit_debug_tsm_addr"); - if (!md) - return static_cast(nullptr); - return reinterpret_cast(cast(cast(md)->getValue())->getZExtValue()); - }); - if (ptr) { - *ptr = std::move(TSM); - } -} - void jl_register_jit_object(const object::ObjectFile &debugObj, std::function getLoadAddress, std::function lookupWriteAddress); @@ -922,97 +857,6 @@ class JLEHFrameRegistrar final : public jitlink::EHFrameRegistrar { } }; -RTDyldMemoryManager* createRTDyldMemoryManager(void); - -// A simple forwarding class, since OrcJIT v2 needs a unique_ptr, while we have a shared_ptr -class ForwardingMemoryManager : public RuntimeDyld::MemoryManager { -private: - std::shared_ptr MemMgr; - -public: - ForwardingMemoryManager(std::shared_ptr MemMgr) : MemMgr(MemMgr) {} - virtual ~ForwardingMemoryManager() = default; - virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, - StringRef SectionName) override { - return MemMgr->allocateCodeSection(Size, Alignment, SectionID, SectionName); - } - virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, - StringRef SectionName, - bool IsReadOnly) override { - return MemMgr->allocateDataSection(Size, Alignment, SectionID, SectionName, IsReadOnly); - } -#if JL_LLVM_VERSION >= 160000 - virtual void reserveAllocationSpace(uintptr_t CodeSize, Align CodeAlign, - uintptr_t RODataSize, Align RODataAlign, - uintptr_t RWDataSize, Align RWDataAlign) override { - return MemMgr->reserveAllocationSpace(CodeSize, CodeAlign, RODataSize, RODataAlign, RWDataSize, RWDataAlign); - } -#else - virtual void reserveAllocationSpace(uintptr_t CodeSize, uint32_t CodeAlign, - uintptr_t RODataSize, - uint32_t RODataAlign, - uintptr_t RWDataSize, - uint32_t RWDataAlign) override { - return MemMgr->reserveAllocationSpace(CodeSize, CodeAlign, RODataSize, RODataAlign, RWDataSize, RWDataAlign); - } -#endif - virtual bool needsToReserveAllocationSpace() override { - return MemMgr->needsToReserveAllocationSpace(); - } - virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) override { - return MemMgr->registerEHFrames(Addr, LoadAddr, Size); - } - virtual void deregisterEHFrames() override { - return MemMgr->deregisterEHFrames(); - } - virtual bool finalizeMemory(std::string *ErrMsg = nullptr) override { - return MemMgr->finalizeMemory(ErrMsg); - } - virtual void notifyObjectLoaded(RuntimeDyld &RTDyld, - const object::ObjectFile &Obj) override { - return MemMgr->notifyObjectLoaded(RTDyld, Obj); - } -}; - - -#if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_) -void *lookupWriteAddressFor(RTDyldMemoryManager *MemMgr, void *rt_addr); -#endif - -void registerRTDyldJITObject(const object::ObjectFile &Object, - const RuntimeDyld::LoadedObjectInfo &L, - const std::shared_ptr &MemMgr) -{ - StringMap loadedSections; - for (const object::SectionRef &lSection : Object.sections()) { - auto sName = lSection.getName(); - if (sName) { - bool inserted = loadedSections.insert(std::make_pair(*sName, lSection)).second; - assert(inserted); - (void)inserted; - } - } - auto getLoadAddress = [loadedSections = std::move(loadedSections), - &L](const StringRef &sName) -> uint64_t { - auto search = loadedSections.find(sName); - if (search == loadedSections.end()) - return 0; - return L.getSectionLoadAddress(search->second); - }; - - auto DebugObject = L.getObjectForDebug(Object); // ELF requires us to make a copy to mutate the header with the section load addresses. On other platforms this is a no-op. - jl_register_jit_object(DebugObject.getBinary() ? *DebugObject.getBinary() : Object, - getLoadAddress, -#if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_) - [MemMgr](void *p) { return lookupWriteAddressFor(MemMgr.get(), p); } -#else - nullptr -#endif - ); -} namespace { static std::unique_ptr createTargetMachine() JL_NOTSAFEPOINT { TargetOptions options = TargetOptions(); @@ -1581,30 +1425,15 @@ JuliaOJIT::JuliaOJIT() #endif return orc::ThreadSafeContext(std::move(ctx)); }), -#ifdef JL_USE_JITLINK MemMgr(createJITLinkMemoryManager()), ObjectLayer(ES, *MemMgr), -#else - MemMgr(createRTDyldMemoryManager()), - ObjectLayer( - ES, - [this]() { - std::unique_ptr result(new ForwardingMemoryManager(MemMgr)); - return result; - } - ), -#endif - LockLayer(ObjectLayer), - CompileLayer(ES, LockLayer, std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)), + CompileLayer(ES, ObjectLayer, std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)), JITPointersLayer(ES, CompileLayer, orc::IRTransformLayer::TransformFunction(JITPointersT(SharedBytes, RLST_mutex))), OptimizeLayer(ES, JITPointersLayer, orc::IRTransformLayer::TransformFunction(OptimizerT(*TM, PrintLLVMTimers, llvm_printing_mutex))), OptSelLayer(ES, OptimizeLayer, orc::IRTransformLayer::TransformFunction(selectOptLevel)), - DepsVerifyLayer(ES, OptSelLayer, orc::IRTransformLayer::TransformFunction(validateExternRelocations)), - ExternalCompileLayer(ES, LockLayer, + ExternalCompileLayer(ES, ObjectLayer, std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)) { - JL_MUTEX_INIT(&this->jitlock, "JuliaOJIT"); -#ifdef JL_USE_JITLINK # if defined(LLVM_SHLIB) // When dynamically linking against LLVM, use our custom EH frame registration code // also used with RTDyld to inform both our and the libc copy of libunwind. @@ -1617,15 +1446,6 @@ JuliaOJIT::JuliaOJIT() ObjectLayer.addPlugin(std::make_unique()); ObjectLayer.addPlugin(std::make_unique(jit_bytes_size)); -#else - ObjectLayer.setNotifyLoaded( - [this](orc::MaterializationResponsibility &MR, - const object::ObjectFile &Object, - const RuntimeDyld::LoadedObjectInfo &LO) { - registerRTDyldJITObject(Object, LO, MemMgr); - }); -#endif - CompileLayer.setNotifyCompiled(recordDebugTSM); std::string ErrorStr; @@ -1786,51 +1606,11 @@ void JuliaOJIT::addModule(orc::ThreadSafeModule TSM) { JL_TIMING(LLVM_JIT, JIT_Total); ++ModulesAdded; - orc::SymbolLookupSet NewExports; - orc::ThreadSafeModule CurrentlyCompiling; - TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT { - for (auto &F : M.global_values()) { - if (!F.isDeclaration() && F.getLinkage() == GlobalValue::ExternalLinkage) { - auto Name = ES.intern(getMangledName(F.getName())); - NewExports.add(std::move(Name)); - } - } - assert(!verifyLLVMIR(M)); - auto jit_debug_tsm_addr = ConstantInt::get(Type::getIntNTy(M.getContext(), sizeof(void*) * CHAR_BIT), (uintptr_t) &CurrentlyCompiling); - M.addModuleFlag(Module::Error, "julia.__jit_debug_tsm_addr", jit_debug_tsm_addr); - }); - // TODO: what is the performance characteristics of this? - auto Err = DepsVerifyLayer.add(JD, std::move(TSM)); + auto Err = OptSelLayer.add(JD, std::move(TSM)); if (Err) { ES.reportError(std::move(Err)); errs() << "Failed to add module to JIT!\n"; - if (CurrentlyCompiling) { - CurrentlyCompiling.withModuleDo([](Module &M) JL_NOTSAFEPOINT { errs() << "Dumping failing module\n" << M << "\n"; }); - } else { - errs() << "Module unavailable to be printed\n"; - } - abort(); - } - // force eager compilation (for now), due to memory management specifics - // (can't handle compilation recursion) - auto Lookups = ES.lookup({{&JD, orc::JITDylibLookupFlags::MatchExportedSymbolsOnly}}, NewExports); - if (!Lookups) { - ES.reportError(Lookups.takeError()); - errs() << "Failed to lookup symbols in module!\n"; - if (CurrentlyCompiling) { - CurrentlyCompiling.withModuleDo([](Module &M) JL_NOTSAFEPOINT { errs() << "Dumping failing module\n" << M << "\n"; }); - } else { - errs() << "Module unavailable to be printed\n"; - } - } - for (auto &Sym : *Lookups) { - #if JL_LLVM_VERSION >= 170000 - assert(Sym.second.getAddress()); - #else - assert(Sym.second); - #endif - (void) Sym; } } @@ -1855,7 +1635,7 @@ Error JuliaOJIT::addExternalModule(orc::JITDylib &JD, orc::ThreadSafeModule TSM, Error JuliaOJIT::addObjectFile(orc::JITDylib &JD, std::unique_ptr Obj) { assert(Obj && "Can not add null object"); - return LockLayer.add(JD.getDefaultResourceTracker(), std::move(Obj)); + return ObjectLayer.add(JD.getDefaultResourceTracker(), std::move(Obj)); } #if JL_LLVM_VERSION >= 170000 @@ -1973,43 +1753,69 @@ StringRef JuliaOJIT::getFunctionAtAddress(uint64_t Addr, jl_callptr_t invoke, jl return *fname; } - -#ifdef JL_USE_JITLINK -extern "C" orc::shared::CWrapperFunctionResult -llvm_orc_registerJITLoaderGDBAllocAction(const char *Data, size_t Size); +#if JL_LLVM_VERSION >= 170000 +#define addAbsoluteToMap(map,name) \ + (map[mangle(#name)] = {ExecutorAddr::fromPtr(&name), JITSymbolFlags::Exported | JITSymbolFlags::Callable}, orc::ExecutorAddr::fromPtr(&name)) +#else +#define addAbsoluteToMap(map,name) \ + (map[mangle(#name)] = JITEvaluatedSymbol::fromPointer(&name, JITSymbolFlags::Exported | JITSymbolFlags::Callable), orc::ExecutorAddr::fromPtr(&name)) +#endif void JuliaOJIT::enableJITDebuggingSupport() { orc::SymbolMap GDBFunctions; - #if JL_LLVM_VERSION >= 170000 - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBAllocAction")] = {ExecutorAddr::fromPtr(&llvm_orc_registerJITLoaderGDBAllocAction), JITSymbolFlags::Exported | JITSymbolFlags::Callable}; - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBWrapper")] = {ExecutorAddr::fromPtr(&llvm_orc_registerJITLoaderGDBWrapper), JITSymbolFlags::Exported | JITSymbolFlags::Callable}; - #else - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBAllocAction")] = JITEvaluatedSymbol::fromPointer(&llvm_orc_registerJITLoaderGDBAllocAction, JITSymbolFlags::Exported | JITSymbolFlags::Callable); - GDBFunctions[mangle("llvm_orc_registerJITLoaderGDBWrapper")] = JITEvaluatedSymbol::fromPointer(&llvm_orc_registerJITLoaderGDBWrapper, JITSymbolFlags::Exported | JITSymbolFlags::Callable); - #endif + addAbsoluteToMap(GDBFunctions,llvm_orc_registerJITLoaderGDBAllocAction); + auto registerJITLoaderGDBWrapper = addAbsoluteToMap(GDBFunctions,llvm_orc_registerJITLoaderGDBWrapper); cantFail(JD.define(orc::absoluteSymbols(GDBFunctions))); if (TM->getTargetTriple().isOSBinFormatMachO()) ObjectLayer.addPlugin(cantFail(orc::GDBJITDebugInfoRegistrationPlugin::Create(ES, JD, TM->getTargetTriple()))); #ifndef _COMPILER_ASAN_ENABLED_ // TODO: Fix duplicated sections spam #51794 else if (TM->getTargetTriple().isOSBinFormatELF()) //EPCDebugObjectRegistrar doesn't take a JITDylib, so we have to directly provide the call address - ObjectLayer.addPlugin(std::make_unique(ES, std::make_unique(ES, orc::ExecutorAddr::fromPtr(&llvm_orc_registerJITLoaderGDBWrapper)))); + ObjectLayer.addPlugin(std::make_unique(ES, std::make_unique(ES, registerJITLoaderGDBWrapper))); #endif } -#else -void JuliaOJIT::enableJITDebuggingSupport() + +void JuliaOJIT::enableIntelJITEventListener() { - RegisterJITEventListener(JITEventListener::createGDBRegistrationListener()); +#if JL_LLVM_VERSION >= 190000 + if (TT.isOSBinFormatELF()) { + orc::SymbolMap VTuneFunctions; + auto RegisterImplAddr = addAbsoluteToMap(VTuneFunctions,llvm_orc_registerVTuneImpl); + auto UnregisterImplAddr = addAbsoluteToMap(VTuneFunctions,llvm_orc_unregisterVTuneImpl); + ObjectLayer.addPlugin(cantFail(DebugInfoPreservationPlugin::Create())); + //ObjectLayer.addPlugin(cantFail(VTuneSupportPlugin::Create(ES.getExecutorProcessControl(), + // JD, /*EmitDebugInfo=*/true, + // /*TestMode=*/false))); + bool EmitDebugInfo = true; + ObjectLayer.addPlugin(std::make_unique( + ES.getExecutorProcessControl(), RegisterImplAddr, UnregisterImplAddr, EmitDebugInfo)); + } +#endif } -void JuliaOJIT::RegisterJITEventListener(JITEventListener *L) +void JuliaOJIT::enableOProfileJITEventListener() { - if (!L) - return; - this->ObjectLayer.registerJITEventListener(*L); } + +void JuliaOJIT::enablePerfJITEventListener() +{ +#if JL_LLVM_VERSION >= 180000 + orc::SymbolMap PerfFunctions; + auto StartAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfStart); + auto EndAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfEnd); + auto ImplAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfImpl); + cantFail(JD.define(orc::absoluteSymbols(PerfFunctions))); + if (TM->getTargetTriple().isOSBinFormatELF()) { + ObjectLayer.addPlugin(cantFail(DebugInfoPreservationPlugin::Create())); + //ObjectLayer.addPlugin(cantFail(PerfSupportPlugin::Create( + // ES.getExecutorProcessControl(), *JD, true, true))); + bool EmitDebugInfo = true, EmitUnwindInfo = true; + ObjectLayer.addPlugin(std::make_unique( + ES.getExecutorProcessControl(), StartAddr, EndAddr, ImplAddr, EmitDebugInfo, EmitUnwindInfo)); + } #endif +} const DataLayout& JuliaOJIT::getDataLayout() const { @@ -2030,12 +1836,7 @@ std::string JuliaOJIT::getMangledName(const GlobalValue *GV) size_t JuliaOJIT::getTotalBytes() const { - auto bytes = jit_bytes_size.load(std::memory_order_relaxed); -#ifndef JL_USE_JITLINK - size_t getRTDyldMemoryManagerTotalBytes(RTDyldMemoryManager *mm) JL_NOTSAFEPOINT; - bytes += getRTDyldMemoryManagerTotalBytes(MemMgr.get()); -#endif - return bytes; + return jit_bytes_size.load(std::memory_order_relaxed); } void JuliaOJIT::addBytes(size_t bytes) @@ -2043,6 +1844,7 @@ void JuliaOJIT::addBytes(size_t bytes) jit_bytes_size.fetch_add(bytes, std::memory_order_relaxed); } + void JuliaOJIT::printTimers() { for (auto &printer : PrintLLVMTimers) { @@ -2251,72 +2053,6 @@ static void jl_decorate_module(Module &M) { } } -// Implements Tarjan's SCC (strongly connected components) algorithm, simplified to remove the count variable -static int jl_add_to_ee( - orc::ThreadSafeModule &M, - const StringMap &NewExports, - DenseMap &Queued, - SmallVectorImpl &Stack) -{ - // First check if the TSM is empty (already compiled) - if (!M) - return 0; - // Next check and record if it is on the stack somewhere - { - auto &Id = Queued[&M]; - if (Id) - return Id; - Stack.push_back(&M); - Id = Stack.size(); - } - // Finally work out the SCC - int depth = Stack.size(); - int MergeUp = depth; - SmallVector Children; - M.withModuleDo([&](Module &m) JL_NOTSAFEPOINT { - for (auto &F : m.global_objects()) { - if (F.isDeclaration() && F.getLinkage() == GlobalValue::ExternalLinkage) { - auto Callee = NewExports.find(F.getName()); - if (Callee != NewExports.end()) { - auto *CM = Callee->second; - if (*CM && CM != &M) { - auto Down = Queued.find(CM); - if (Down != Queued.end()) - MergeUp = std::min(MergeUp, Down->second); - else - Children.push_back(CM); - } - } - } - } - }); - assert(MergeUp > 0); - for (auto *CM : Children) { - int Down = jl_add_to_ee(*CM, NewExports, Queued, Stack); - assert(Down <= (int)Stack.size()); - if (Down) - MergeUp = std::min(MergeUp, Down); - } - if (MergeUp < depth) - return MergeUp; - while (1) { - // Not in a cycle (or at the top of it) - // remove SCC state and merge every CM from the cycle into M - orc::ThreadSafeModule *CM = Stack.back(); - auto it = Queued.find(CM); - assert(it->second == (int)Stack.size()); - Queued.erase(it); - Stack.pop_back(); - if ((int)Stack.size() < depth) { - assert(&M == CM); - break; - } - jl_merge_module(M, std::move(*CM)); - } - jl_ExecutionEngine->addModule(std::move(M)); - return 0; -} - static uint64_t getAddressForFunction(StringRef fname) { auto addr = jl_ExecutionEngine->getFunctionAddress(fname); diff --git a/src/jitlayers.h b/src/jitlayers.h index 93669c2351d88..a1d8dbd77cd28 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -31,36 +31,12 @@ #include #include -// As of LLVM 13, there are two runtime JIT linker implementations, the older -// RuntimeDyld (used via orc::RTDyldObjectLinkingLayer) and the newer JITLink -// (used via orc::ObjectLinkingLayer). -// -// JITLink is not only more flexible (which isn't of great importance for us, as -// we do only single-threaded in-process codegen), but crucially supports using -// the Small code model, where the linker needs to fix up relocations between -// object files that end up far apart in address space. RuntimeDyld can't do -// that and relies on the Large code model instead, which is broken on -// aarch64-darwin (macOS on ARM64), and not likely to ever be supported there -// (see https://bugs.llvm.org/show_bug.cgi?id=52029). -// -// However, JITLink is a relatively young library and lags behind in platform -// and feature support (e.g. Windows, JITEventListeners for various profilers, -// etc.). Thus, we currently only use JITLink where absolutely required, that is, -// for Mac/aarch64 and Linux/aarch64. -// #define JL_FORCE_JITLINK - #if defined(_COMPILER_ASAN_ENABLED_) || defined(_COMPILER_MSAN_ENABLED_) || defined(_COMPILER_TSAN_ENABLED_) # define HAS_SANITIZER #endif // The sanitizers don't play well with our memory manager -#if defined(JL_FORCE_JITLINK) || defined(_CPU_AARCH64_) || defined(HAS_SANITIZER) -# define JL_USE_JITLINK -#endif - # include -# include -# include using namespace llvm; @@ -359,33 +335,11 @@ using SharedBytesT = StringSet::MapEntryT class JuliaOJIT { public: -#ifdef JL_USE_JITLINK typedef orc::ObjectLinkingLayer ObjLayerT; -#else - typedef orc::RTDyldObjectLinkingLayer ObjLayerT; -#endif - struct LockLayerT : public orc::ObjectLayer { - - LockLayerT(orc::ObjectLayer &BaseLayer) JL_NOTSAFEPOINT : orc::ObjectLayer(BaseLayer.getExecutionSession()), BaseLayer(BaseLayer) {} - ~LockLayerT() JL_NOTSAFEPOINT = default; - - void emit(std::unique_ptr R, - std::unique_ptr O) override { - JL_TIMING(LLVM_JIT, JIT_Link); -#ifndef JL_USE_JITLINK - std::lock_guard lock(EmissionMutex); -#endif - BaseLayer.emit(std::move(R), std::move(O)); - } - private: - orc::ObjectLayer &BaseLayer; - std::mutex EmissionMutex; - }; typedef orc::IRCompileLayer CompileLayerT; typedef orc::IRTransformLayer JITPointersLayerT; typedef orc::IRTransformLayer OptimizeLayerT; typedef orc::IRTransformLayer OptSelLayerT; - typedef orc::IRTransformLayer DepsVerifyLayerT; typedef object::OwningBinary OwningObj; template - void registerObject(const ObjT &Obj, const LoadResult &LO); - public: JuliaOJIT() JL_NOTSAFEPOINT; ~JuliaOJIT() JL_NOTSAFEPOINT; void enableJITDebuggingSupport() JL_NOTSAFEPOINT; -#ifndef JL_USE_JITLINK - // JITLink doesn't support old JITEventListeners (yet). - void RegisterJITEventListener(JITEventListener *L) JL_NOTSAFEPOINT; -#endif + void enableIntelJITEventListener() JL_NOTSAFEPOINT; + void enableOProfileJITEventListener() JL_NOTSAFEPOINT; + void enablePerfJITEventListener() JL_NOTSAFEPOINT; orc::SymbolStringPtr mangle(StringRef Name) JL_NOTSAFEPOINT; void addGlobalMapping(StringRef Name, uint64_t Addr) JL_NOTSAFEPOINT; @@ -610,18 +558,12 @@ class JuliaOJIT { ResourcePool> ContextPool; std::atomic jit_bytes_size{0}; -#ifndef JL_USE_JITLINK - const std::shared_ptr MemMgr; -#else const std::unique_ptr MemMgr; -#endif ObjLayerT ObjectLayer; - LockLayerT LockLayer; CompileLayerT CompileLayer; JITPointersLayerT JITPointersLayer; OptimizeLayerT OptimizeLayer; OptSelLayerT OptSelLayer; - DepsVerifyLayerT DepsVerifyLayer; CompileLayerT ExternalCompileLayer; }; extern JuliaOJIT *jl_ExecutionEngine;