diff --git a/NEWS.md b/NEWS.md
index 7ccab87d92019..9e0f2dd1f18ec 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -135,6 +135,9 @@ Standard library changes
   Further, percent utilization is now reported as a total or per-thread, based on whether the thread is idle or not at
   each sample. `Profile.fetch()` by default strips out the new metadata to ensure backwards compatibility with external
   profiling data consumers, but can be included with the `include_meta` kwarg. ([#41742])
+* The new `Profile.Allocs` module allows memory allocations to be profiled. The stack trace, type, and size of each
+  allocation is recorded, and a `sample_rate` argument allows a tunable amount of allocations to be skipped,
+  reducing performance overhead. ([#42768])
 
 #### Random
 
diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md
index 5596ebae512aa..91b3c34a4fb20 100644
--- a/doc/src/manual/profile.md
+++ b/doc/src/manual/profile.md
@@ -336,6 +336,31 @@ and how much garbage it collects each time. This can be enabled with
 [`GC.enable_logging(true)`](@ref), which causes Julia to log to stderr every time
 a garbage collection happens.
 
+### Allocation Profiler
+
+The allocation profiler records the stack trace, type, and size of each
+allocation while it is running. It can be invoked with
+[`Profile.Allocs.@profile`](@ref).
+
+This information about the allocations is returned as an array of `Alloc`
+objects, wrapped in an `AllocResults` object. The best way to visualize
+these is currently with the [PProf.jl](https://github.com/JuliaPerf/PProf.jl)
+library, which can visualize the call stacks which are making the most
+allocations.
+
+The allocation profiler does have significant overhead, so a `sample_rate`
+argument can be passed to speed it up by making it skip some allocations.
+Passing `sample_rate=1.0` will make it record everything (which is slow);
+`sample_rate=0.1` will record only 10% of the allocations (faster), etc.
+
+!!! note
+
+    The current implementation of the Allocations Profiler _does not
+    capture all allocations._ You can read more about the missing allocations
+    and the plan to improve this, here: https://github.com/JuliaLang/julia/issues/43688.
+    Calling `Profile.Allocs.fetch()` will print a log line reporting the percentage
+    of missed allocations, so you can understand the accuracy of your profile.
+
 ## External Profiling
 
 Currently Julia supports `Intel VTune`, `OProfile` and `perf` as external profiling tools.
diff --git a/src/Makefile b/src/Makefile
index cda08995f5538..b93415a63e763 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -45,7 +45,7 @@ RUNTIME_SRCS := \
 	jltypes gf typemap smallintset ast builtins module interpreter symbol \
 	dlload sys init task array dump staticdata toplevel jl_uv datatype \
 	simplevector runtime_intrinsics precompile \
-	threading partr stackwalk gc gc-debug gc-pages gc-stacks method \
+	threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler method \
 	jlapi signal-handling safepoint timing subtype \
 	crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage
 SRCS := jloptions runtime_ccall rtutils
@@ -288,7 +288,7 @@ $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)
 $(BUILDDIR)/dump.o $(BUILDDIR)/dump.dbg.obj: $(addprefix $(SRCDIR)/,common_symbols1.inc common_symbols2.inc builtin_proto.h serialize.h)
 $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc.h
 $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h
-$(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h
+$(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-alloc-profiler.h
 $(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h
 $(BUILDDIR)/interpreter.o $(BUILDDIR)/interpreter.dbg.obj: $(SRCDIR)/builtin_proto.h
 $(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/codegen_shared.h
diff --git a/src/array.c b/src/array.c
index 104052644428a..d620278e34b14 100644
--- a/src/array.c
+++ b/src/array.c
@@ -508,6 +508,7 @@ JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len)
         s = jl_gc_big_alloc(ptls, allocsz);
     }
     jl_set_typeof(s, jl_string_type);
+    maybe_record_alloc_to_profile(s, len);
     *(size_t*)s = len;
     jl_string_data(s)[len] = 0;
     return s;
diff --git a/src/gc-alloc-profiler.cpp b/src/gc-alloc-profiler.cpp
new file mode 100644
index 0000000000000..d00b2117e2c04
--- /dev/null
+++ b/src/gc-alloc-profiler.cpp
@@ -0,0 +1,139 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#include "gc-alloc-profiler.h"
+
+#include "julia_internal.h"
+#include "gc.h"
+
+#include <string>
+#include <vector>
+
+using std::string;
+using std::vector;
+
+struct jl_raw_backtrace_t {
+    jl_bt_element_t *data;
+    size_t size;
+};
+
+struct jl_raw_alloc_t {
+    jl_datatype_t *type_address;
+    jl_raw_backtrace_t backtrace;
+    size_t size;
+};
+
+// == These structs define the global singleton profile buffer that will be used by
+// callbacks to store profile results. ==
+struct jl_per_thread_alloc_profile_t {
+    vector<jl_raw_alloc_t> allocs;
+};
+
+struct jl_alloc_profile_t {
+    double sample_rate;
+
+    vector<jl_per_thread_alloc_profile_t> per_thread_profiles;
+};
+
+struct jl_combined_results {
+    vector<jl_raw_alloc_t> combined_allocs;
+};
+
+// == Global variables manipulated by callbacks ==
+
+jl_alloc_profile_t g_alloc_profile;
+int g_alloc_profile_enabled = false;
+jl_combined_results g_combined_results; // Will live forever.
+
+// === stack stuff ===
+
+jl_raw_backtrace_t get_raw_backtrace() {
+    // A single large buffer to record backtraces onto
+    static jl_bt_element_t static_bt_data[JL_MAX_BT_SIZE];
+
+    size_t bt_size = rec_backtrace(static_bt_data, JL_MAX_BT_SIZE, 2);
+
+    // Then we copy only the needed bytes out of the buffer into our profile.
+    size_t bt_bytes = bt_size * sizeof(jl_bt_element_t);
+    jl_bt_element_t *bt_data = (jl_bt_element_t*) malloc(bt_bytes);
+    memcpy(bt_data, static_bt_data, bt_bytes);
+
+    return jl_raw_backtrace_t{
+        bt_data,
+        bt_size
+    };
+}
+
+// == exported interface ==
+
+extern "C" {  // Needed since these functions doesn't take any arguments.
+
+JL_DLLEXPORT void jl_start_alloc_profile(double sample_rate) {
+    // We only need to do this once, the first time this is called.
+    while (g_alloc_profile.per_thread_profiles.size() < jl_n_threads) {
+        g_alloc_profile.per_thread_profiles.push_back(jl_per_thread_alloc_profile_t{});
+    }
+
+    g_alloc_profile.sample_rate = sample_rate;
+    g_alloc_profile_enabled = true;
+}
+
+JL_DLLEXPORT jl_profile_allocs_raw_results_t jl_fetch_alloc_profile() {
+    // combine allocs
+    // TODO: interleave to preserve ordering
+    for (auto& profile : g_alloc_profile.per_thread_profiles) {
+        for (const auto& alloc : profile.allocs) {
+            g_combined_results.combined_allocs.push_back(alloc);
+        }
+
+        profile.allocs.clear();
+    }
+
+    return jl_profile_allocs_raw_results_t{
+        g_combined_results.combined_allocs.data(),
+        g_combined_results.combined_allocs.size(),
+    };
+}
+
+JL_DLLEXPORT void jl_stop_alloc_profile() {
+    g_alloc_profile_enabled = false;
+}
+
+JL_DLLEXPORT void jl_free_alloc_profile() {
+    // Free any allocs that remain in the per-thread profiles, that haven't
+    // been combined yet (which happens in fetch_alloc_profiles()).
+    for (auto& profile : g_alloc_profile.per_thread_profiles) {
+        for (auto alloc : profile.allocs) {
+            free(alloc.backtrace.data);
+        }
+        profile.allocs.clear();
+    }
+
+    // Free the allocs that have been already combined into the combined results object.
+    for (auto alloc : g_combined_results.combined_allocs) {
+        free(alloc.backtrace.data);
+    }
+
+    g_combined_results.combined_allocs.clear();
+}
+
+// == callback called into by the outside ==
+
+void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size) JL_NOTSAFEPOINT {
+    auto& global_profile = g_alloc_profile;
+    auto& profile = global_profile.per_thread_profiles[jl_threadid()];
+
+    auto sample_val = double(rand()) / double(RAND_MAX);
+    auto should_record = sample_val <= global_profile.sample_rate;
+    if (!should_record) {
+        return;
+    }
+
+    auto type = (jl_datatype_t*)jl_typeof(val);
+    profile.allocs.emplace_back(jl_raw_alloc_t{
+        type,
+        get_raw_backtrace(),
+        size
+    });
+}
+
+}  // extern "C"
diff --git a/src/gc-alloc-profiler.h b/src/gc-alloc-profiler.h
new file mode 100644
index 0000000000000..3509b77daa1fc
--- /dev/null
+++ b/src/gc-alloc-profiler.h
@@ -0,0 +1,49 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#ifndef JL_GC_ALLOC_PROFILER_H
+#define JL_GC_ALLOC_PROFILER_H
+
+#include "julia.h"
+#include "ios.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ---------------------------------------------------------------------
+// The public interface to call from Julia for allocations profiling
+// ---------------------------------------------------------------------
+
+// Forward-declaration to avoid depenency in header file.
+struct jl_raw_alloc_t;  // Defined in gc-alloc-profiler.cpp
+
+typedef struct {
+    struct jl_raw_alloc_t *allocs;
+    size_t num_allocs;
+} jl_profile_allocs_raw_results_t;
+
+JL_DLLEXPORT void jl_start_alloc_profile(double sample_rate);
+JL_DLLEXPORT jl_profile_allocs_raw_results_t jl_fetch_alloc_profile(void);
+JL_DLLEXPORT void jl_stop_alloc_profile(void);
+JL_DLLEXPORT void jl_free_alloc_profile(void);
+
+// ---------------------------------------------------------------------
+// Functions to call from GC when alloc profiling is enabled
+// ---------------------------------------------------------------------
+
+void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size) JL_NOTSAFEPOINT;
+
+extern int g_alloc_profile_enabled;
+
+static inline void maybe_record_alloc_to_profile(jl_value_t *val, size_t size) JL_NOTSAFEPOINT {
+    if (__unlikely(g_alloc_profile_enabled)) {
+        _maybe_record_alloc_to_profile(val, size);
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif  // JL_GC_ALLOC_PROFILER_H
diff --git a/src/gc.h b/src/gc.h
index f34ddaee5ee64..544486d933e10 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -24,6 +24,7 @@
 #endif
 #endif
 #include "julia_assert.h"
+#include "gc-alloc-profiler.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/julia_internal.h b/src/julia_internal.h
index a8c289f14a145..e13b0853ec3af 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -9,6 +9,7 @@
 #include "support/hashing.h"
 #include "support/ptrhash.h"
 #include "support/strtod.h"
+#include "gc-alloc-profiler.h"
 #include <uv.h>
 #if !defined(_WIN32)
 #include <unistd.h>
@@ -364,6 +365,7 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty)
         v = jl_gc_big_alloc(ptls, allocsz);
     }
     jl_set_typeof(v, ty);
+    maybe_record_alloc_to_profile(v, sz);
     return v;
 }
 
diff --git a/stdlib/Profile/docs/src/index.md b/stdlib/Profile/docs/src/index.md
index ac60bb92cb5ed..89894723b1116 100644
--- a/stdlib/Profile/docs/src/index.md
+++ b/stdlib/Profile/docs/src/index.md
@@ -1,5 +1,7 @@
 # [Profiling](@id lib-profiling)
 
+## CPU Profiling
+
 ```@docs
 Profile.@profile
 ```
@@ -15,3 +17,18 @@ Profile.retrieve
 Profile.callers
 Profile.clear_malloc_data
 ```
+
+## Memory profiling
+
+```@docs
+Profile.Allocs.@profile
+```
+
+The methods in `Profile.Allocs` are not exported and need to be called e.g. as `Profile.Allocs.fetch()`.
+
+```@docs
+Profile.Allocs.clear
+Profile.Allocs.fetch
+Profile.Allocs.start
+Profile.Allocs.stop
+```
diff --git a/stdlib/Profile/src/Allocs.jl b/stdlib/Profile/src/Allocs.jl
new file mode 100644
index 0000000000000..922b31932cb0f
--- /dev/null
+++ b/stdlib/Profile/src/Allocs.jl
@@ -0,0 +1,230 @@
+module Allocs
+
+using Base.StackTraces: StackTrace, StackFrame, lookup
+using Base: InterpreterIP
+
+# --- Raw results structs, originally defined in C ---
+
+# The C jl_bt_element_t object contains either an IP pointer (size_t) or a void*.
+const BTElement = Csize_t;
+
+# matches jl_raw_backtrace_t on the C side
+struct RawBacktrace
+    data::Ptr{BTElement} # in C: *jl_bt_element_t
+    size::Csize_t
+end
+
+# matches jl_raw_alloc_t on the C side
+struct RawAlloc
+    type::Ptr{Type}
+    backtrace::RawBacktrace
+    size::Csize_t
+end
+
+# matches jl_profile_allocs_raw_results_t on the C side
+struct RawResults
+    allocs::Ptr{RawAlloc}
+    num_allocs::Csize_t
+end
+
+"""
+    Profile.Allocs.@profile [sample_rate=0.0001] expr
+
+Profile allocations that happen during `expr`, returning
+both the result and and AllocResults struct.
+
+A sample rate of 1.0 will record everything; 0.0 will record nothing.
+
+```julia
+julia> Profile.Allocs.@profile sample_rate=0.01 peakflops()
+1.03733270279065e11
+
+julia> results = Profile.Allocs.fetch()
+
+julia> last(sort(results.allocs, by=x->x.size))
+Profile.Allocs.Alloc(Vector{Any}, Base.StackTraces.StackFrame[_new_array_ at array.c:127, ...], 5576)
+```
+"""
+macro profile(opts, ex)
+    _prof_expr(ex, opts)
+end
+macro profile(ex)
+    _prof_expr(ex, :(sample_rate=0.0001))
+end
+
+# globals used for tracking how many allocs we're missing
+# vs the alloc counters used by @time
+const _g_gc_num_before = Ref{Base.GC_Num}()
+const _g_sample_rate = Ref{Real}()
+const _g_expected_sampled_allocs = Ref{Float64}(0)
+
+function _prof_expr(expr, opts)
+    quote
+        $start(; $(esc(opts)))
+        local res = $(esc(expr))
+        $stop()
+        res
+    end
+end
+
+"""
+    Profile.Allocs.start(sample_rate::Real)
+
+Begin recording allocations with the given sample rate
+A sample rate of 1.0 will record everything; 0.0 will record nothing.
+"""
+function start(; sample_rate::Real)
+    ccall(:jl_start_alloc_profile, Cvoid, (Cdouble,), Float64(sample_rate))
+
+    _g_sample_rate[] = sample_rate
+    _g_gc_num_before[] = Base.gc_num()
+end
+
+"""
+    Profile.Allocs.stop()
+
+Stop recording allocations.
+"""
+function stop()
+    ccall(:jl_stop_alloc_profile, Cvoid, ())
+
+    # increment a counter of how many allocs we would expect
+    # the memory profiler to see, based on how many allocs
+    # actually happened.
+    gc_num_after = Base.gc_num()
+    gc_diff = Base.GC_Diff(gc_num_after, _g_gc_num_before[])
+    alloc_count = Base.gc_alloc_count(gc_diff)
+    expected_samples = alloc_count * _g_sample_rate[]
+    _g_expected_sampled_allocs[] += expected_samples
+end
+
+"""
+    Profile.Allocs.clear()
+
+Clear all previously profiled allocation information from memory.
+"""
+function clear()
+    ccall(:jl_free_alloc_profile, Cvoid, ())
+
+    _g_expected_sampled_allocs[] = 0
+    return nothing
+end
+
+"""
+    Profile.Allocs.fetch()
+
+Retrieve the recorded allocations, and decode them into Julia
+objects which can be analyzed.
+"""
+function fetch()
+    raw_results = ccall(:jl_fetch_alloc_profile, RawResults, ())
+    decoded_results = decode(raw_results)
+
+    # avoid divide-by-0 errors
+    if _g_expected_sampled_allocs[] > 0
+        missed_allocs = max(0, _g_expected_sampled_allocs[] - length(decoded_results.allocs))
+        missed_percentage = max(0, round(Int, missed_allocs / _g_expected_sampled_allocs[] * 100))
+        if missed_percentage > 0
+            @warn("The allocation profiler is not fully implemented, and missed approximately" *
+            " $(missed_percentage)% (estimated $(round(Int, missed_allocs)) / $(round(Int,
+            _g_expected_sampled_allocs[]))) " *
+                    "of sampled allocs in the last run. " *
+                    "For more info see https://github.com/JuliaLang/julia/issues/43688")
+        else
+            @warn("The allocation profiler is not fully implemented, and may have missed" *
+            " some of the allocs. " *
+                    "For more info see https://github.com/JuliaLang/julia/issues/43688")
+        end
+    end
+    return decoded_results
+end
+
+# decoded results
+
+struct Alloc
+    type::Any
+    stacktrace::StackTrace
+    size::Int
+end
+
+struct AllocResults
+    allocs::Vector{Alloc}
+end
+
+# Without this, the Alloc's stacktrace prints for lines and lines and lines...
+function Base.show(io::IO, a::Alloc)
+    stacktrace_sample = length(a.stacktrace) >= 1 ? "$(a.stacktrace[1]), ..." : ""
+    print(io, "$Alloc($(a.type), $StackFrame[$stacktrace_sample], $(a.size))")
+end
+
+const BacktraceCache = Dict{BTElement,Vector{StackFrame}}
+
+# copied from julia_internal.h
+const JL_BUFF_TAG = UInt(0x4eadc000)
+
+struct CorruptType end
+struct BufferType end
+
+function load_type(ptr::Ptr{Type})
+    if UInt(ptr) < UInt(4096)
+        return CorruptType
+    elseif UInt(ptr) == JL_BUFF_TAG
+        return BufferType
+    end
+    return unsafe_pointer_to_objref(ptr)
+end
+
+function decode_alloc(cache::BacktraceCache, raw_alloc::RawAlloc)::Alloc
+    Alloc(
+        load_type(raw_alloc.type),
+        stacktrace_memoized(cache, load_backtrace(raw_alloc.backtrace)),
+        UInt(raw_alloc.size)
+    )
+end
+
+function decode(raw_results::RawResults)::AllocResults
+    cache = BacktraceCache()
+    allocs = [
+        decode_alloc(cache, unsafe_load(raw_results.allocs, i))
+        for i in 1:raw_results.num_allocs
+    ]
+    return AllocResults(allocs)
+end
+
+function load_backtrace(trace::RawBacktrace)::Vector{BTElement}
+    out = Vector{BTElement}()
+    for i in 1:trace.size
+        push!(out, unsafe_load(trace.data, i))
+    end
+
+    return out
+end
+
+function stacktrace_memoized(
+    cache::BacktraceCache,
+    trace::Vector{BTElement},
+    c_funcs::Bool=true
+)::StackTrace
+    stack = StackTrace()
+    for ip in trace
+        frames = get(cache, ip) do
+            res = lookup(ip)
+            cache[ip] = res
+            return res
+        end
+        for frame in frames
+            # Skip frames that come from C calls.
+            if c_funcs || !frame.from_c
+                push!(stack, frame)
+            end
+        end
+    end
+    return stack
+end
+
+# Precompile once for the package cache.
+@assert precompile(start, ())
+@assert precompile(stop, ())
+@assert precompile(fetch, ())
+
+end
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 3409e79bdb128..f969df5ed040e 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -1139,4 +1139,6 @@ function warning_empty(;summary = false)
     end
 end
 
+include("Allocs.jl")
+
 end # module
diff --git a/stdlib/Profile/test/allocs.jl b/stdlib/Profile/test/allocs.jl
new file mode 100644
index 0000000000000..4eae5b0cf68c1
--- /dev/null
+++ b/stdlib/Profile/test/allocs.jl
@@ -0,0 +1,147 @@
+using Test
+using Profile: Allocs
+
+@testset "alloc profiler doesn't segfault" begin
+    res = Allocs.@profile sample_rate=1.0 begin
+        # test the allocations during compilation
+        using Base64
+    end
+    profile = Allocs.fetch()
+
+    @test length(profile.allocs) > 0
+    first_alloc = profile.allocs[1]
+    @test first_alloc.size > 0
+    @test length(first_alloc.stacktrace) > 0
+    @test length(string(first_alloc.type)) > 0
+end
+
+@testset "alloc profiler works when there are multiple tasks on multiple threads" begin
+    NUM_TASKS = 1000
+
+    # This test is only really meaningful if we're running on
+    # multiple threads, but this isn't true on the windows tests,
+    # causing them to fail. So, commenting this assertion out.
+    # @test Threads.nthreads() > 1
+
+    function do_work()
+        ch = Channel{Vector{Float64}}(Inf)
+        @sync for i in 1:NUM_TASKS
+            Threads.@spawn begin
+                # generate garbage
+                put!(ch, zeros(100))
+            end
+        end
+        close(ch)
+    end
+
+    # call once to make sure it's compiled
+    precompile(do_work, ())
+    do_work()
+
+    res = Allocs.@profile sample_rate=1 begin
+        do_work()
+    end
+    profile = Allocs.fetch()
+
+    # expecting at least 2 allocations per task:
+    # 1. the task
+    # 2. the vector
+    @test length(profile.allocs) >= 2*NUM_TASKS
+    first_alloc = profile.allocs[1]
+    @test first_alloc.size > 0
+    @test length(first_alloc.stacktrace) > 0
+    @test length(string(first_alloc.type)) > 0
+
+    @testset for type in (Task, Vector{Float64},)
+        @test length(filter(a->a.type <: type, profile.allocs)) >= NUM_TASKS
+    end
+
+    # TODO: it would be nice to assert that these tasks
+    # were actually scheduled onto multiple threads,
+    # and we see allocs from all threads in the profile
+end
+
+@testset "alloc profiler start stop fetch clear" begin
+    function do_work()
+        # Compiling allocates a lot
+        for f in (gensym() for _ in 1:10)
+            @eval begin
+                $f() = 10
+                $f()
+            end
+        end
+    end
+
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > 10
+
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+
+    Allocs.@profile sample_rate=1 do_work()
+    curr_allocs = length(Allocs.fetch().allocs)
+    @test curr_allocs > 10
+
+    # Do _more_ work, adding into the same profile
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > curr_allocs
+
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+
+    # Clear without fetching
+
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+
+    # And things still work like normal afterwards
+
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > 10
+
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > 10
+
+    Allocs.clear()
+end
+
+@testset "alloc profiler warning message" begin
+    @testset "no allocs" begin
+        Profile.Allocs.clear()
+        Profile.Allocs.fetch()
+    end
+    @testset "catches all allocations" begin
+        foo() = []
+        precompile(foo, ())
+        Profile.Allocs.clear()
+        Profile.Allocs.@profile sample_rate=1 foo()
+        # Fake that we expected exactly 1 alloc, since we should have recorded >= 1
+        Profile.Allocs._g_expected_sampled_allocs[] = 1
+        @assert length(Profile.Allocs.fetch().allocs) >= 1
+    end
+    @testset "misses some allocations" begin
+        foo() = []
+        precompile(foo, ())
+        Profile.Allocs.clear()
+        Profile.Allocs.@profile sample_rate=1 foo()
+        # Fake some allocs that we missed, to force the print statement
+        Profile.Allocs._g_expected_sampled_allocs[] += 10
+        @assert 1 <= length(Profile.Allocs.fetch().allocs) < 10
+    end
+end
+
+@testset "alloc profiler catches strings" begin
+    Allocs.@profile sample_rate=1 "$(rand())"
+
+    prof = Allocs.fetch()
+    Allocs.clear()
+
+    @test length(prof.allocs) >= 1
+    @test length([a for a in prof.allocs if a.type == String]) >= 1
+end
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index ac7c8baefe09e..ddddac55fcd93 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -219,3 +219,5 @@ end
     node = root.down[stackframe(:f1, :file1, 2)]
     @test only(node.down).first == lidict[8]
 end
+
+include("allocs.jl")