From 745ec38d4e10bdde64df4d94798d088149770fad Mon Sep 17 00:00:00 2001 From: Lukas Schwerdt Date: Mon, 30 Jan 2023 15:56:23 +0100 Subject: [PATCH 1/3] Faster radix sort Unrolling loop in Base.Sort.radix_sort_pass! increases speed by about 50%. --- base/sort.jl | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/base/sort.jl b/base/sort.jl index 985e0e8f597f3..c19b5f265eaf0 100644 --- a/base/sort.jl +++ b/base/sort.jl @@ -1112,6 +1112,13 @@ function radix_sort!(v::AbstractVector{U}, lo::Integer, hi::Integer, bits::Unsig shift < bits || return true end end + +macro repeat4x(a) + quote + $a; $a; $a; $a + end |> esc +end + function radix_sort_pass!(t, lo, hi, offset, counts, v, shift, chunk_size) mask = UInt(1) << chunk_size - 1 # mask is defined in pass so that the compiler @inbounds begin # ↳ knows it's shape @@ -1130,14 +1137,27 @@ function radix_sort_pass!(t, lo, hi, offset, counts, v, shift, chunk_size) # belongs, not the number of elements in each bucket. We will put the first element # of bucket 0x00 in t[counts[1]], the next element of bucket 0x00 in t[counts[1]+1], # and the last element of bucket 0x00 in t[counts[2]-1]. - - for k in lo:hi + + #loop unrolled 4x + k = lo + while k <= hi - 4 + @repeat4x begin + x = v[k] # lookup the element + i = (x >> shift)&mask + 1 # compute its bucket's index for this pass + j = counts[i] # lookup the target index + t[j + offset] = x # put the element where it belongs + counts[i] = j + 1 # increment the target index for the next + k += 1 # ↳ element in this bucket + end + end + while k <= hi x = v[k] # lookup the element i = (x >> shift)&mask + 1 # compute its bucket's index for this pass j = counts[i] # lookup the target index t[j + offset] = x # put the element where it belongs counts[i] = j + 1 # increment the target index for the next - end # ↳ element in this bucket + k += 1 # ↳ element in this bucket + end end end function radix_chunk_size_heuristic(lo::Integer, hi::Integer, bits::Unsigned) From 62d234b4dda41acea8f89749b338d05669a2e3c8 Mon Sep 17 00:00:00 2001 From: Lukas Schwerdt Date: Mon, 30 Jan 2023 17:43:00 +0100 Subject: [PATCH 2/3] Remove superfluous white space --- base/sort.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/sort.jl b/base/sort.jl index c19b5f265eaf0..42b0da5decb34 100644 --- a/base/sort.jl +++ b/base/sort.jl @@ -1137,7 +1137,7 @@ function radix_sort_pass!(t, lo, hi, offset, counts, v, shift, chunk_size) # belongs, not the number of elements in each bucket. We will put the first element # of bucket 0x00 in t[counts[1]], the next element of bucket 0x00 in t[counts[1]+1], # and the last element of bucket 0x00 in t[counts[2]-1]. - + #loop unrolled 4x k = lo while k <= hi - 4 @@ -1149,7 +1149,7 @@ function radix_sort_pass!(t, lo, hi, offset, counts, v, shift, chunk_size) counts[i] = j + 1 # increment the target index for the next k += 1 # ↳ element in this bucket end - end + end while k <= hi x = v[k] # lookup the element i = (x >> shift)&mask + 1 # compute its bucket's index for this pass From cb97f9fd125fa45de56e775c5bff179765a6027f Mon Sep 17 00:00:00 2001 From: Lukas Schwerdt Date: Mon, 30 Jan 2023 17:43:50 +0100 Subject: [PATCH 3/3] Use Cartesian.@nexprs instead of custom macro --- base/sort.jl | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/base/sort.jl b/base/sort.jl index 42b0da5decb34..00ed8fca59a3d 100644 --- a/base/sort.jl +++ b/base/sort.jl @@ -5,7 +5,7 @@ module Sort using Base.Order using Base: copymutable, midpoint, require_one_based_indexing, uinttype, - sub_with_overflow, add_with_overflow, OneTo, BitSigned, BitIntegerType, top_set_bit + sub_with_overflow, add_with_overflow, OneTo, BitSigned, BitIntegerType, top_set_bit, Cartesian import Base: sort, @@ -1112,13 +1112,6 @@ function radix_sort!(v::AbstractVector{U}, lo::Integer, hi::Integer, bits::Unsig shift < bits || return true end end - -macro repeat4x(a) - quote - $a; $a; $a; $a - end |> esc -end - function radix_sort_pass!(t, lo, hi, offset, counts, v, shift, chunk_size) mask = UInt(1) << chunk_size - 1 # mask is defined in pass so that the compiler @inbounds begin # ↳ knows it's shape @@ -1141,7 +1134,7 @@ function radix_sort_pass!(t, lo, hi, offset, counts, v, shift, chunk_size) #loop unrolled 4x k = lo while k <= hi - 4 - @repeat4x begin + Cartesian.@nexprs 4 _ -> begin x = v[k] # lookup the element i = (x >> shift)&mask + 1 # compute its bucket's index for this pass j = counts[i] # lookup the target index