From 837e3d37eaab3b11f76ed2634b8b4fecd28f20ed Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 16 Mar 2022 14:40:18 -0400 Subject: [PATCH] port partr multiq to julia Direct translation, not necessarily fully idiomatic. In preparation for future improvements. --- base/Base.jl | 1 + base/boot.jl | 2 +- base/partr.jl | 164 +++++++++++++++++++++++++++++ base/task.jl | 40 ++++--- src/builtins.c | 5 +- src/gc.c | 4 - src/init.c | 6 +- src/jl_exported_funcs.inc | 1 - src/jltypes.c | 14 ++- src/julia.h | 4 +- src/partr.c | 215 ++++---------------------------------- src/staticdata.c | 2 +- src/task.c | 2 +- 13 files changed, 227 insertions(+), 233 deletions(-) create mode 100644 base/partr.jl diff --git a/base/Base.jl b/base/Base.jl index ed50dd7a21bc2..42e740d15916c 100644 --- a/base/Base.jl +++ b/base/Base.jl @@ -277,6 +277,7 @@ include("condition.jl") include("threads.jl") include("lock.jl") include("channels.jl") +include("partr.jl") include("task.jl") include("threads_overloads.jl") include("weakkeydict.jl") diff --git a/base/boot.jl b/base/boot.jl index d797e0a815a81..c994a491f438c 100644 --- a/base/boot.jl +++ b/base/boot.jl @@ -224,7 +224,7 @@ primitive type Char <: AbstractChar 32 end primitive type Int8 <: Signed 8 end #primitive type UInt8 <: Unsigned 8 end primitive type Int16 <: Signed 16 end -primitive type UInt16 <: Unsigned 16 end +#primitive type UInt16 <: Unsigned 16 end #primitive type Int32 <: Signed 32 end #primitive type UInt32 <: Unsigned 32 end #primitive type Int64 <: Signed 64 end diff --git a/base/partr.jl b/base/partr.jl new file mode 100644 index 0000000000000..d66dc2afcbacd --- /dev/null +++ b/base/partr.jl @@ -0,0 +1,164 @@ +# This file is a part of Julia. 
License is MIT: https://julialang.org/license + +module Partr + +# a task heap +mutable struct taskheap + const lock::ReentrantLock + const tasks::Vector{Task} + @atomic ntasks::Int32 + @atomic priority::UInt16 + taskheap() = new(ReentrantLock(), Vector{Task}(undef, tasks_per_heap), zero(Int32), typemax(UInt16)) +end + +# multiqueue parameters +const heap_d = UInt32(8) +const heap_c = UInt32(2) + +# size of each heap +const tasks_per_heap = Int32(65536) # TODO: this should be smaller by default, but growable! + +# the multiqueue's heaps +global heaps::Vector{taskheap} +global heap_p::UInt32 = 0 + +# unbias state for the RNG +global cong_unbias::UInt32 = 0 + + +cong(max::UInt32, unbias::UInt32) = ccall(:jl_rand_ptls, UInt32, (UInt32, UInt32), max, unbias) + UInt32(1) + +function unbias_cong(max::UInt32) + return typemax(UInt32) - ((typemax(UInt32) % max) + UInt32(1)) +end + + +function multiq_init(nthreads) + global heap_p = heap_c * nthreads + global cong_unbias = unbias_cong(UInt32(heap_p)) + global heaps = Vector{taskheap}(undef, heap_p) + for i = UInt32(1):heap_p + heaps[i] = taskheap() + end + nothing +end + + +function sift_up(heap::taskheap, idx::Int32) + while idx > Int32(1) + parent = (idx - Int32(2)) ÷ heap_d + Int32(1) + if heap.tasks[idx].priority < heap.tasks[parent].priority + t = heap.tasks[parent] + heap.tasks[parent] = heap.tasks[idx] + heap.tasks[idx] = t + idx = parent + else + break + end + end +end + + +function sift_down(heap::taskheap, idx::Int32) + if idx <= heap.ntasks + for child = (heap_d * idx - heap_d + Int32(2)):(heap_d * idx + Int32(1)) + child > tasks_per_heap && break + if isassigned(heap.tasks, child) && + heap.tasks[child].priority < heap.tasks[idx].priority + t = heap.tasks[idx] + heap.tasks[idx] = heap.tasks[child] + heap.tasks[child] = t + sift_down(heap, child) + end + end + end +end + + +function multiq_insert(task::Task, priority::UInt16) + task.priority = priority + + rn = cong(heap_p, cong_unbias) + while 
!trylock(heaps[rn].lock) + rn = cong(heap_p, cong_unbias) + end + + if heaps[rn].ntasks >= tasks_per_heap + unlock(heaps[rn].lock) + # multiq insertion failed, increase #tasks per heap + return false + end + + ntasks = heaps[rn].ntasks + Int32(1) + @atomic :monotonic heaps[rn].ntasks = ntasks + heaps[rn].tasks[ntasks] = task + sift_up(heaps[rn], ntasks) + priority = heaps[rn].priority + if task.priority < priority + @atomic :monotonic heaps[rn].priority = task.priority + end + unlock(heaps[rn].lock) + return true +end + + +function multiq_deletemin() + local rn1, rn2 + local prio1, prio2 + + @label retry + GC.safepoint() + for i = UInt32(1):heap_p + if i == heap_p + return nothing + end + rn1 = cong(heap_p, cong_unbias) + rn2 = cong(heap_p, cong_unbias) + prio1 = heaps[rn1].priority + prio2 = heaps[rn2].priority + if prio1 > prio2 + prio1 = prio2 + rn1 = rn2 + elseif prio1 == prio2 && prio1 == typemax(UInt16) + continue + end + if trylock(heaps[rn1].lock) + if prio1 == heaps[rn1].priority + break + end + unlock(heaps[rn1].lock) + end + end + + task = heaps[rn1].tasks[1] + tid = Threads.threadid() + if ccall(:jl_set_task_tid, Cint, (Any, Cint), task, tid-1) == 0 + unlock(heaps[rn1].lock) + @goto retry + end + ntasks = heaps[rn1].ntasks + @atomic :monotonic heaps[rn1].ntasks = ntasks - Int32(1) + heaps[rn1].tasks[1] = heaps[rn1].tasks[ntasks] + Base._unsetindex!(heaps[rn1].tasks, Int(ntasks)) + prio1 = typemax(UInt16) + if ntasks > 1 + sift_down(heaps[rn1], Int32(1)) + prio1 = heaps[rn1].tasks[1].priority + end + @atomic :monotonic heaps[rn1].priority = prio1 + unlock(heaps[rn1].lock) + + return task +end + + +function multiq_check_empty() + for i = UInt32(1):heap_p + if heaps[i].ntasks != 0 + return false + end + end + return true +end + +end diff --git a/base/task.jl b/base/task.jl index 90dea508383b1..63ca9ed9b8b27 100644 --- a/base/task.jl +++ b/base/task.jl @@ -668,12 +668,14 @@ const StickyWorkqueue = InvasiveLinkedListSynchronized{Task} global const Workqueues 
= [StickyWorkqueue()] global const Workqueue = Workqueues[1] # default work queue is thread 1 function __preinit_threads__() - if length(Workqueues) < Threads.nthreads() - resize!(Workqueues, Threads.nthreads()) - for i = 2:length(Workqueues) + nt = Threads.nthreads() + if length(Workqueues) < nt + resize!(Workqueues, nt) + for i = 2:nt Workqueues[i] = StickyWorkqueue() end end + Partr.multiq_init(nt) nothing end @@ -698,7 +700,7 @@ function enq_work(t::Task) end push!(Workqueues[tid], t) else - if ccall(:jl_enqueue_task, Cint, (Any,), t) != 0 + if !Partr.multiq_insert(t, t.priority) # if multiq is full, give to a random thread (TODO fix) if tid == 0 tid = mod(time_ns() % Int, Threads.nthreads()) + 1 @@ -864,24 +866,30 @@ function ensure_rescheduled(othertask::Task) end function trypoptask(W::StickyWorkqueue) - isempty(W) && return - t = popfirst!(W) - if t._state !== task_state_runnable - # assume this somehow got queued twice, - # probably broken now, but try discarding this switch and keep going - # can't throw here, because it's probably not the fault of the caller to wait - # and don't want to use print() here, because that may try to incur a task switch - ccall(:jl_safe_printf, Cvoid, (Ptr{UInt8}, Int32...), - "\nWARNING: Workqueue inconsistency detected: popfirst!(Workqueue).state != :runnable\n") - return + while !isempty(W) + t = popfirst!(W) + if t._state !== task_state_runnable + # assume this somehow got queued twice, + # probably broken now, but try discarding this switch and keep going + # can't throw here, because it's probably not the fault of the caller to wait + # and don't want to use print() here, because that may try to incur a task switch + ccall(:jl_safe_printf, Cvoid, (Ptr{UInt8}, Int32...), + "\nWARNING: Workqueue inconsistency detected: popfirst!(Workqueue).state != :runnable\n") + continue + end + return t end - return t + return Partr.multiq_deletemin() +end + +function checktaskempty() + return Partr.multiq_check_empty() end @noinline 
function poptask(W::StickyWorkqueue) task = trypoptask(W) if !(task isa Task) - task = ccall(:jl_task_get_next, Ref{Task}, (Any, Any), trypoptask, W) + task = ccall(:jl_task_get_next, Ref{Task}, (Any, Any, Any), trypoptask, W, checktaskempty) end set_next_task(task) nothing diff --git a/src/builtins.c b/src/builtins.c index f81069424d784..f8a78485551ad 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -2021,10 +2021,11 @@ void jl_init_primitives(void) JL_GC_DISABLED add_builtin("Bool", (jl_value_t*)jl_bool_type); add_builtin("UInt8", (jl_value_t*)jl_uint8_type); - add_builtin("Int32", (jl_value_t*)jl_int32_type); - add_builtin("Int64", (jl_value_t*)jl_int64_type); + add_builtin("UInt16", (jl_value_t*)jl_uint16_type); add_builtin("UInt32", (jl_value_t*)jl_uint32_type); add_builtin("UInt64", (jl_value_t*)jl_uint64_type); + add_builtin("Int32", (jl_value_t*)jl_int32_type); + add_builtin("Int64", (jl_value_t*)jl_int64_type); #ifdef _P64 add_builtin("Int", (jl_value_t*)jl_int64_type); #else diff --git a/src/gc.c b/src/gc.c index 044c9b8fc1446..c21d6ca6c4b7f 100644 --- a/src/gc.c +++ b/src/gc.c @@ -2820,7 +2820,6 @@ static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); } -void jl_gc_mark_enqueued_tasks(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp); extern jl_value_t *cmpswap_names JL_GLOBALLY_ROOTED; // mark the initial root set @@ -2829,9 +2828,6 @@ static void mark_roots(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) // modules gc_mark_queue_obj(gc_cache, sp, jl_main_module); - // tasks - jl_gc_mark_enqueued_tasks(gc_cache, sp); - // invisible builtin values if (jl_an_empty_vec_any != NULL) gc_mark_queue_obj(gc_cache, sp, jl_an_empty_vec_any); diff --git a/src/init.c b/src/init.c index 6bebffdcf326c..295d76cf16bc2 100644 --- a/src/init.c +++ b/src/init.c @@ -773,7 +773,6 @@ static void post_boot_hooks(void) jl_char_type = (jl_datatype_t*)core("Char"); jl_int8_type = 
(jl_datatype_t*)core("Int8"); jl_int16_type = (jl_datatype_t*)core("Int16"); - jl_uint16_type = (jl_datatype_t*)core("UInt16"); jl_float16_type = (jl_datatype_t*)core("Float16"); jl_float32_type = (jl_datatype_t*)core("Float32"); jl_float64_type = (jl_datatype_t*)core("Float64"); @@ -785,10 +784,11 @@ static void post_boot_hooks(void) jl_bool_type->super = jl_integer_type; jl_uint8_type->super = jl_unsigned_type; - jl_int32_type->super = jl_signed_type; - jl_int64_type->super = jl_signed_type; + jl_uint16_type->super = jl_unsigned_type; jl_uint32_type->super = jl_unsigned_type; jl_uint64_type->super = jl_unsigned_type; + jl_int32_type->super = jl_signed_type; + jl_int64_type->super = jl_signed_type; jl_errorexception_type = (jl_datatype_t*)core("ErrorException"); jl_stackovf_exception = jl_new_struct_uninit((jl_datatype_t*)core("StackOverflowError")); diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 2aed69f47c30a..7d8488bcea73c 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -119,7 +119,6 @@ XX(jl_egal__bits) \ XX(jl_egal__special) \ XX(jl_eh_restore_state) \ - XX(jl_enqueue_task) \ XX(jl_enter_handler) \ XX(jl_enter_threaded_region) \ XX(jl_environ) \ diff --git a/src/jltypes.c b/src/jltypes.c index 5e84b200af937..29729237a242d 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -2144,6 +2144,8 @@ void jl_init_types(void) JL_GC_DISABLED jl_any_type, jl_emptysvec, 64); jl_uint8_type = jl_new_primitivetype((jl_value_t*)jl_symbol("UInt8"), core, jl_any_type, jl_emptysvec, 8); + jl_uint16_type = jl_new_primitivetype((jl_value_t*)jl_symbol("UInt16"), core, + jl_any_type, jl_emptysvec, 16); jl_ssavalue_type = jl_new_datatype(jl_symbol("SSAValue"), core, jl_any_type, jl_emptysvec, jl_perm_symsvec(1, "id"), @@ -2508,7 +2510,7 @@ void jl_init_types(void) JL_GC_DISABLED "inferred", //"edges", //"absolute_max", - "ipo_purity_bits", "purity_bits", + "ipo_purity_bits", "purity_bits", "argescapes", "isspecsig", "precompile", 
"invoke", "specptr", // function object decls "relocatability"), @@ -2602,7 +2604,7 @@ void jl_init_types(void) JL_GC_DISABLED NULL, jl_any_type, jl_emptysvec, - jl_perm_symsvec(14, + jl_perm_symsvec(15, "next", "queue", "storage", @@ -2616,8 +2618,9 @@ void jl_init_types(void) JL_GC_DISABLED "rngState3", "_state", "sticky", - "_isexception"), - jl_svec(14, + "_isexception", + "priority"), + jl_svec(15, jl_any_type, jl_any_type, jl_any_type, @@ -2631,7 +2634,8 @@ void jl_init_types(void) JL_GC_DISABLED jl_uint64_type, jl_uint8_type, jl_bool_type, - jl_bool_type), + jl_bool_type, + jl_uint16_type), jl_emptysvec, 0, 1, 6); jl_value_t *listt = jl_new_struct(jl_uniontype_type, jl_task_type, jl_nothing_type); diff --git a/src/julia.h b/src/julia.h index 3153b87c3a9b9..863e9c8de7cbb 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1881,12 +1881,12 @@ typedef struct _jl_task_t { _Atomic(uint8_t) _state; uint8_t sticky; // record whether this Task can be migrated to a new thread _Atomic(uint8_t) _isexception; // set if `result` is an exception to throw or that we exited with + // multiqueue priority + uint16_t priority; // hidden state: // id of owning thread - does not need to be defined until the task runs _Atomic(int16_t) tid; - // multiqueue priority - int16_t prio; // saved gc stack top for context switches jl_gcframe_t *gcstack; size_t world_age; diff --git a/src/partr.c b/src/partr.c index c8cc3245ebb4c..74b0e2c8db6a9 100644 --- a/src/partr.c +++ b/src/partr.c @@ -34,11 +34,9 @@ static const int16_t sleeping = 1; // information: These observations require sequentially-consistent fences to be inserted between each of those operational phases. 
// [^store_buffering_1]: These fences are used to avoid the cycle 2b -> 1a -> 1b -> 2a -> 2b where // * Dequeuer: -// * 1a: `jl_atomic_store_relaxed(&ptls->sleep_check_state, sleeping)` -// * 1b: `multiq_check_empty` returns true +// * 1: `jl_atomic_store_relaxed(&ptls->sleep_check_state, sleeping)` // * Enqueuer: -// * 2a: `multiq_insert` -// * 2b: `jl_atomic_load_relaxed(&ptls->sleep_check_state)` in `jl_wakeup_thread` returns `not_sleeping` +// * 2: `jl_atomic_load_relaxed(&ptls->sleep_check_state)` in `jl_wakeup_thread` returns `not_sleeping` // i.e., the dequeuer misses the enqueue and enqueuer misses the sleep state transition. @@ -67,187 +65,21 @@ JL_DLLEXPORT int jl_set_task_tid(jl_task_t *task, int tid) JL_NOTSAFEPOINT extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_value_t *obj) JL_NOTSAFEPOINT; -// multiq -// --- - -/* a task heap */ -typedef struct taskheap_tag { - uv_mutex_t lock; - jl_task_t **tasks; - _Atomic(int32_t) ntasks; - _Atomic(int16_t) prio; -} taskheap_t; - -/* multiqueue parameters */ -static const int32_t heap_d = 8; -static const int heap_c = 2; - -/* size of each heap */ -static const int tasks_per_heap = 65536; // TODO: this should be smaller by default, but growable! 
- -/* the multiqueue's heaps */ -static taskheap_t *heaps; -static int32_t heap_p; - -/* unbias state for the RNG */ -static uint64_t cong_unbias; - - -static inline void multiq_init(void) -{ - heap_p = heap_c * jl_n_threads; - heaps = (taskheap_t *)calloc(heap_p, sizeof(taskheap_t)); - for (int32_t i = 0; i < heap_p; ++i) { - uv_mutex_init(&heaps[i].lock); - heaps[i].tasks = (jl_task_t **)calloc(tasks_per_heap, sizeof(jl_task_t*)); - jl_atomic_store_relaxed(&heaps[i].ntasks, 0); - jl_atomic_store_relaxed(&heaps[i].prio, INT16_MAX); - } - unbias_cong(heap_p, &cong_unbias); -} - - -static inline void sift_up(taskheap_t *heap, int32_t idx) -{ - if (idx > 0) { - int32_t parent = (idx-1)/heap_d; - if (heap->tasks[idx]->prio < heap->tasks[parent]->prio) { - jl_task_t *t = heap->tasks[parent]; - heap->tasks[parent] = heap->tasks[idx]; - heap->tasks[idx] = t; - sift_up(heap, parent); - } - } -} - - -static inline void sift_down(taskheap_t *heap, int32_t idx) -{ - if (idx < jl_atomic_load_relaxed(&heap->ntasks)) { - for (int32_t child = heap_d*idx + 1; - child < tasks_per_heap && child <= heap_d*idx + heap_d; - ++child) { - if (heap->tasks[child] - && heap->tasks[child]->prio < heap->tasks[idx]->prio) { - jl_task_t *t = heap->tasks[idx]; - heap->tasks[idx] = heap->tasks[child]; - heap->tasks[child] = t; - sift_down(heap, child); - } - } - } -} - - -static inline int multiq_insert(jl_task_t *task, int16_t priority) -{ - jl_ptls_t ptls = jl_current_task->ptls; - uint64_t rn; - - task->prio = priority; - do { - rn = cong(heap_p, cong_unbias, &ptls->rngseed); - } while (uv_mutex_trylock(&heaps[rn].lock) != 0); - - if (jl_atomic_load_relaxed(&heaps[rn].ntasks) >= tasks_per_heap) { - uv_mutex_unlock(&heaps[rn].lock); - // multiq insertion failed, increase #tasks per heap - return -1; - } - - int32_t ntasks = jl_atomic_load_relaxed(&heaps[rn].ntasks); - jl_atomic_store_relaxed(&heaps[rn].ntasks, ntasks + 1); - heaps[rn].tasks[ntasks] = task; - sift_up(&heaps[rn], ntasks); - 
int16_t prio = jl_atomic_load_relaxed(&heaps[rn].prio); - if (task->prio < prio) - jl_atomic_store_relaxed(&heaps[rn].prio, task->prio); - uv_mutex_unlock(&heaps[rn].lock); - - return 0; -} +// parallel task runtime +// --- -static inline jl_task_t *multiq_deletemin(void) +JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max, uint32_t unbias) { jl_ptls_t ptls = jl_current_task->ptls; - uint64_t rn1 = 0, rn2; - int32_t i; - int16_t prio1, prio2; - jl_task_t *task; - retry: - jl_gc_safepoint(); - for (i = 0; i < heap_p; ++i) { - rn1 = cong(heap_p, cong_unbias, &ptls->rngseed); - rn2 = cong(heap_p, cong_unbias, &ptls->rngseed); - prio1 = jl_atomic_load_relaxed(&heaps[rn1].prio); - prio2 = jl_atomic_load_relaxed(&heaps[rn2].prio); - if (prio1 > prio2) { - prio1 = prio2; - rn1 = rn2; - } - else if (prio1 == prio2 && prio1 == INT16_MAX) - continue; - if (uv_mutex_trylock(&heaps[rn1].lock) == 0) { - if (prio1 == jl_atomic_load_relaxed(&heaps[rn1].prio)) - break; - uv_mutex_unlock(&heaps[rn1].lock); - } - } - if (i == heap_p) - return NULL; - - task = heaps[rn1].tasks[0]; - if (!jl_set_task_tid(task, ptls->tid)) { - uv_mutex_unlock(&heaps[rn1].lock); - goto retry; - } - int32_t ntasks = jl_atomic_load_relaxed(&heaps[rn1].ntasks) - 1; - jl_atomic_store_relaxed(&heaps[rn1].ntasks, ntasks); - heaps[rn1].tasks[0] = heaps[rn1].tasks[ntasks]; - heaps[rn1].tasks[ntasks] = NULL; - prio1 = INT16_MAX; - if (ntasks > 0) { - sift_down(&heaps[rn1], 0); - prio1 = heaps[rn1].tasks[0]->prio; - } - jl_atomic_store_relaxed(&heaps[rn1].prio, prio1); - uv_mutex_unlock(&heaps[rn1].lock); - - return task; -} - - -void jl_gc_mark_enqueued_tasks(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) -{ - int32_t i, j; - for (i = 0; i < heap_p; ++i) - for (j = 0; j < jl_atomic_load_relaxed(&heaps[i].ntasks); ++j) - jl_gc_mark_queue_obj_explicit(gc_cache, sp, (jl_value_t *)heaps[i].tasks[j]); + return cong(max, -(uint64_t)-unbias, &ptls->rngseed); } - -static int multiq_check_empty(void) -{ - int32_t i; - 
for (i = 0; i < heap_p; ++i) { - if (jl_atomic_load_relaxed(&heaps[i].ntasks) != 0) - return 0; - } - return 1; -} - - - -// parallel task runtime -// --- - // initialize the threading infrastructure // (used only by the main thread) void jl_init_threadinginfra(void) { - /* initialize the synchronization trees pool and the multiqueue */ - multiq_init(); + /* initialize the synchronization trees pool */ sleep_threshold = DEFAULT_THREAD_SLEEP_THRESHOLD; char *cp = getenv(THREAD_SLEEP_THRESHOLD_NAME); @@ -299,18 +131,6 @@ void jl_threadfun(void *arg) } -// enqueue the specified task for execution -JL_DLLEXPORT int jl_enqueue_task(jl_task_t *task) -{ - char failed; - if (multiq_insert(task, task->prio) == -1) - failed = 1; - failed = 0; - JL_PROBE_RT_TASKQ_INSERT(jl_current_task->ptls, task); - return failed; -} - - int jl_running_under_rr(int recheck) { #ifdef _OS_LINUX_ @@ -439,21 +259,22 @@ JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) } -// get the next runnable task from the multiq +// get the next runnable task static jl_task_t *get_next_task(jl_value_t *trypoptask, jl_value_t *q) { jl_gc_safepoint(); - jl_value_t *args[2] = { trypoptask, q }; - jl_task_t *task = (jl_task_t*)jl_apply(args, 2); + jl_task_t *task = (jl_task_t*)jl_apply_generic(trypoptask, &q, 1); if (jl_typeis(task, jl_task_type)) { int self = jl_atomic_load_relaxed(&jl_current_task->tid); jl_set_task_tid(task, self); return task; } - task = multiq_deletemin(); - if (task) - JL_PROBE_RT_TASKQ_GET(jl_current_task->ptls, task); - return task; + return NULL; +} + +static int check_empty(jl_value_t *checkempty) +{ + return jl_apply_generic(checkempty, NULL, 0) == jl_true; } static int may_sleep(jl_ptls_t ptls) JL_NOTSAFEPOINT @@ -468,7 +289,7 @@ static int may_sleep(jl_ptls_t ptls) JL_NOTSAFEPOINT extern _Atomic(unsigned) _threadedregion; -JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q) +JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, 
jl_value_t *checkempty) { jl_task_t *ct = jl_current_task; uint64_t start_cycles = 0; @@ -480,7 +301,7 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q) // quick, race-y check to see if there seems to be any stuff in there jl_cpu_pause(); - if (!multiq_check_empty()) { + if (!check_empty(checkempty)) { start_cycles = 0; continue; } @@ -492,7 +313,7 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q) jl_atomic_store_relaxed(&ptls->sleep_check_state, sleeping); jl_fence(); // [^store_buffering_1] JL_PROBE_RT_SLEEP_CHECK_SLEEP(ptls); - if (!multiq_check_empty()) { // uses relaxed loads + if (!check_empty(checkempty)) { // uses relaxed loads if (jl_atomic_load_relaxed(&ptls->sleep_check_state) != not_sleeping) { jl_atomic_store_relaxed(&ptls->sleep_check_state, not_sleeping); // let other threads know they don't need to wake us JL_PROBE_RT_SLEEP_CHECK_TASKQ_WAKE(ptls); diff --git a/src/staticdata.c b/src/staticdata.c index e72f29257ce19..f697bc88e313d 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -164,13 +164,13 @@ jl_value_t **const*const get_tags(void) { INSERT_TAG(jl_int64_type); INSERT_TAG(jl_bool_type); INSERT_TAG(jl_uint8_type); + INSERT_TAG(jl_uint16_type); INSERT_TAG(jl_uint32_type); INSERT_TAG(jl_uint64_type); INSERT_TAG(jl_char_type); INSERT_TAG(jl_weakref_type); INSERT_TAG(jl_int8_type); INSERT_TAG(jl_int16_type); - INSERT_TAG(jl_uint16_type); INSERT_TAG(jl_float16_type); INSERT_TAG(jl_float32_type); INSERT_TAG(jl_float64_type); diff --git a/src/task.c b/src/task.c index 1dd4e76b8ba1c..dbafa0ee29e0a 100644 --- a/src/task.c +++ b/src/task.c @@ -798,7 +798,7 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion t->gcstack = NULL; t->excstack = NULL; t->started = 0; - t->prio = -1; + t->priority = 0; jl_atomic_store_relaxed(&t->tid, t->copy_stack ? 
jl_atomic_load_relaxed(&ct->tid) : -1); // copy_stacks are always pinned since they can't be moved t->ptls = NULL; t->world_age = ct->world_age;