diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 2f79d3de97a2d..3c608f358b694 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -144,8 +144,8 @@ The keyword arguments can be any combination of:
     line, `:count` sorts in order of number of collected samples, and `:overhead` sorts by the number of samples
     incurred by each function by itself.
 
- - `groupby` -- Controls grouping over tasks and threads, or no grouping. Options are `:none` (default), `:threads`, `:tasks`,
-    `[:threads, :tasks]`, or `[:tasks, :threads]` where the last two provide nested grouping.
+ - `groupby` -- Controls grouping over tasks and threads, or no grouping. Options are `:none` (default), `:thread`, `:task`,
+    `[:thread, :task]`, or `[:task, :thread]` where the last two provide nested grouping.
 
  - `noisefloor` -- Limits frames that exceed the heuristic noise floor of the sample (only applies to format `:tree`).
     A suggested value to try for this is 2.0 (the default is 0). This parameter hides samples for which `n <= noisefloor * √N`,
@@ -296,7 +296,13 @@ function is_block_end(data, i)
     # and we could have (though very unlikely):
     # 1:<stack><metadata><null><null><NULL><metadata><null><null>:end
     # and we want to ignore the triple NULL (which is an ip).
-    return data[i] == 0 && data[i - 1] == 0 && data[i - 2] != 0
+    data[i] == 0 || return false        # first block end null
+    data[i - 1] == 0 || return false    # second block end null
+    data[i - 2] in 1:2 || return false  # sleep state
+    data[i - 3] != 0 || return false    # cpu_cycle_clock
+    data[i - 4] != 0 || return false    # taskid
+    data[i - 5] != 0 || return false    # threadid
+    return true
 end
 
 """
@@ -519,29 +525,51 @@ function fetch(;include_meta = false)
     GC.@preserve data unsafe_copyto!(pointer(data), get_data_pointer(), len)
     if include_meta || isempty(data)
         return data
-    else
-        nblocks = 0
-        for i = 2:length(data)
-            if is_block_end(data, i) # detect block ends and count them
-                nblocks += 1
-            end
-        end
-        data_stripped = Vector{UInt}(undef, length(data) - (nblocks * (nmeta + 1)))
-        j = length(data_stripped)
-        i = length(data)
-        while i > 0 && j > 0
-            data_stripped[j] = data[i]
-            if is_block_end(data, i)
-                i -= (nmeta + 1) # metadata fields and the extra NULL IP
-            end
-            i -= 1
-            j -= 1
+    end
+    return strip_meta(data)
+end
+
+function strip_meta(data) # build a new vector from `data` without per-block metadata and the extra NULL
+    nblocks = count(Base.Fix1(is_block_end, data), eachindex(data)) # number of block-end positions
+    data_stripped = Vector{UInt}(undef, length(data) - (nblocks * (nmeta + 1))) # drops nmeta + 1 per block
+    j = length(data_stripped) # write cursor into `data_stripped`
+    i = length(data) # read cursor into `data`
+    while i > 0 && j > 0 # copy back to front
+        data_stripped[j] = data[i]
+        if is_block_end(data, i)
+            i -= (nmeta + 1) # metadata fields and the extra NULL IP
         end
-        @assert i == j == 0 "metadata stripping failed i=$i j=$j data[1:i]=$(data[1:i])"
-        return data_stripped
+        i -= 1
+        j -= 1
     end
+    @assert i == j == 0 "metadata stripping failed i=$i j=$j data[1:i]=$(data[1:i])" # both cursors must run out together
+    return data_stripped
 end
 
+"""
+    Profile.add_fake_meta(data; threadid = 1, taskid = 0xf0f0f0f0) -> data_with_meta
+
+The converse of `Profile.fetch(;include_meta = false)`; this will add fake metadata, and can be used
+for compatibility and by packages (e.g., FlameGraphs.jl) that would rather not depend on the internal
+details of the metadata format.
+"""
+function add_fake_meta(data; threadid = 1, taskid = 0xf0f0f0f0)
+    threadid == 0 && error("Fake threadid cannot be 0") # is_block_end rejects zero metadata fields
+    taskid == 0 && error("Fake taskid cannot be 0")
+    any(Base.Fix1(is_block_end, data), eachindex(data)) && error("input already has metadata")
+    cpu_cycle_clock = UInt64(99) # fake clock value; bumped per block so every block gets its own stamp
+    data_with_meta = similar(data, 0)
+    for val in data
+        if iszero(val) # each NULL is treated as a block terminator and expanded
+            cpu_cycle_clock += 1
+            # (threadid, taskid, cpu_cycle_clock, thread_sleeping + 1), then the two block-end NULLs
+            push!(data_with_meta, threadid, taskid, cpu_cycle_clock, false + 1, 0, 0)
+        else
+            push!(data_with_meta, val)
+        end
+    end
+    return data_with_meta
+end
 
 ## Print as a flat list
 # Counts the number of times each line appears, at any nesting level and at the topmost level
@@ -807,7 +835,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI
     skip = false
     nsleeping = 0
     for i in startframe:-1:1
-        (startframe - 1) >= i >= (startframe - (nmeta + 1)) && continue # skip metadata (its read ahead below) and extra block end NULL IP
+        (startframe - 1) >= i >= (startframe - (nmeta + 1)) && continue # skip metadata (it's read ahead below) and extra block end NULL IP
         ip = all[i]
         if is_block_end(all, i)
             # read metadata
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index 940f1c4478ae3..092372358e07f 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -1,6 +1,7 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
 using Test, Profile, Serialization, Logging
+using Base.StackTraces: StackFrame
 
 Profile.clear()
 Profile.init()
@@ -78,7 +79,17 @@ end
     data_with = Profile.fetch(include_meta = true)
     @test data_without[1] == data_with[1]
     @test data_without[end] == data_with[end]
-    @test length(data_without) < length(data_with)
+    nblocks = count(Base.Fix1(Profile.is_block_end, data_with), eachindex(data_with))
+    @test length(data_without) == length(data_with) - nblocks * (Profile.nmeta + 1)
+
+    data_with_fake = Profile.add_fake_meta(data_without)
+    @test_throws "input already has metadata" Profile.add_fake_meta(data_with)
+    data_stripped = Profile.strip_meta(data_with_fake)
+    @test data_stripped == data_without
+    # ideally the test below would be a test for equality, but real sample ips can be nulls, and thus
+    # adding metadata back in can convert those ips to new block ends, and the length is then longer
+    @test length(data_with_fake) >= length(data_with)
+
 end
 
 Profile.clear()
@@ -175,3 +186,36 @@ let cmd = Base.julia_cmd()
     @test success(p)
     @test parse(Int, s) > 100
 end
+
+@testset "FlameGraphs" begin
+    # FlameGraphs makes use of some of Profile's internals. Detect possible breakage by mimicking some of its tests.
+    # Breakage is acceptable since these internals are not part of the stable API, but it's better to know, and
+    # ideally any breakage should be paired with an issue or PR in FlameGraphs.
+    #
+    # This also improves the thoroughness of our overall Profile tests.
+    stackframe(func, file, line; C=false) = StackFrame(Symbol(func), Symbol(file), line, nothing, C, false, 0)
+
+    backtraces = UInt64[   4, 3, 2, 1,   # order: callees then caller
+                        0, 6, 5, 1,
+                        0, 8, 7,
+                        0, 4, 3, 2, 1,
+                        0]
+    backtraces = Profile.add_fake_meta(backtraces) # expand each 0 into fake metadata plus a block end
+    lidict = Dict{UInt64,StackFrame}(1=>stackframe(:f1, :file1, 1),
+                                     2=>stackframe(:f2, :file1, 5),
+                                     3=>stackframe(:f3, :file2, 1),
+                                     4=>stackframe(:f2, :file1, 15),
+                                     5=>stackframe(:f4, :file1, 20),
+                                     6=>stackframe(:f5, :file3, 1),
+                                     7=>stackframe(:f1, :file1, 2),
+                                     8=>stackframe(:f6, :file3, 10))
+    root = Profile.StackFrameTree{StackFrame}()
+    Profile.tree!(root, backtraces, lidict, #= C =# true, :off)
+    @test length(root.down) == 2 # outermost callers are ip 1 (f1, line 1) and ip 7 (f1, line 2)
+    for k in keys(root.down)
+        @test k.file == :file1
+        @test k.line ∈ (1, 2)
+    end
+    node = root.down[stackframe(:f1, :file1, 2)] # the frame registered for ip 7
+    @test only(node.down).first == lidict[8] # ip 7's only callee in the samples is ip 8
+end