taichi-dev · YuCrazing · Sep 23, 2022 · Sep 22, 2022 · Sep 22, 2022 · Sep 22, 2022
diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py
@@ -9,7 +9,7 @@
 # Provide a shortcut to types since they're commonly used.
 from taichi.types.primitive_types import *
 
-from taichi import ad, experimental, graph, linalg, math, tools
+from taichi import ad, algorithms, experimental, graph, linalg, math, tools
 from taichi.ui import GUI, hex_to_rgb, rgb_to_hex, ui
 
 # Issue#2223: Do not reorder, or we're busted with partially initialized module

diff --git a/python/taichi/_kernels.py b/python/taichi/_kernels.py
@@ -292,9 +292,6 @@ def save_texture_to_numpy(tex: texture_type.rw_texture(num_dimensions=2,
 
 
 # Odd-even merge sort
-# References:
-# https://developer.nvidia.com/gpugems/gpugems2/part-vi-simulation-and-numerical-algorithms/chapter-46-improved-gpu-sorting
-# https://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
 @kernel
 def sort_stage(keys: template(), use_values: int, values: template(), N: int,
                p: int, k: int, invocations: int):
@@ -315,26 +312,7 @@ def sort_stage(keys: template(), use_values: int, values: template(), N: int,
                         values[b] = temp
 
 
-def parallel_sort(keys, values=None):
-    N = keys.shape[0]
-
-    num_stages = 0
-    p = 1
-    while p < N:
-        k = p
-        while k >= 1:
-            invocations = int((N - k - k % p) / (2 * k)) + 1
-            if values is None:
-                sort_stage(keys, 0, keys, N, p, k, invocations)
-            else:
-                sort_stage(keys, 1, values, N, p, k, invocations)
-            num_stages += 1
-            sync()
-            k = int(k / 2)
-        p = int(p * 2)
-    print(num_stages)
-
-
+# Parallel Prefix Sum (Scan)
 @func
 def warp_shfl_up_i32(val: template()):
     global_tid = block.global_thread_idx()
@@ -421,53 +399,3 @@ def blit_from_field_to_field(
         dst: template(), src: template(), offset: i32, size: i32):
     for i in range(size):
         dst[i + offset] = src[i]
-
-
-# Parallel Prefix Sum (Scan)
-# Ref[0]: https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/scan/doc/scan.pdf
-# Ref[1]: https://github.com/NVIDIA/cuda-samples/blob/master/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
-def prefix_sum_inclusive_inplace(input_arr, length):
-    BLOCK_SZ = 64
-    GRID_SZ = int((length + BLOCK_SZ - 1) / BLOCK_SZ)
-
-    # Buffer position and length
-    # This is a single buffer implementation for ease of aot usage
-    ele_num = length
-    ele_nums = [ele_num]
-    start_pos = 0
-    ele_nums_pos = [start_pos]
-
-    while ele_num > 1:
-        ele_num = int((ele_num + BLOCK_SZ - 1) / BLOCK_SZ)
-        ele_nums.append(ele_num)
-        start_pos += BLOCK_SZ * ele_num
-        ele_nums_pos.append(start_pos)
-
-    if input_arr.dtype != i32:
-        raise RuntimeError("Only ti.i32 type is supported for prefix sum.")
-
-    large_arr = field(i32, shape=start_pos)
-
-    if current_cfg().arch == cuda:
-        inclusive_add = warp_shfl_up_i32
-    elif current_cfg().arch == vulkan:
-        inclusive_add = subgroup.inclusive_add
-    else:
-        raise RuntimeError(
-            f"{str(current_cfg().arch)} is not supported for prefix sum.")
-
-    blit_from_field_to_field(large_arr, input_arr, 0, length)
-
-    # Kogge-Stone construction
-    for i in range(len(ele_nums) - 1):
-        if i == len(ele_nums) - 2:
-            scan_add_inclusive(large_arr, ele_nums_pos[i], ele_nums_pos[i + 1],
-                               True, inclusive_add)
-        else:
-            scan_add_inclusive(large_arr, ele_nums_pos[i], ele_nums_pos[i + 1],
-                               False, inclusive_add)
-
-    for i in range(len(ele_nums) - 3, -1, -1):
-        uniform_add(large_arr, ele_nums_pos[i], ele_nums_pos[i + 1])
-
-    blit_from_field_to_field(input_arr, large_arr, 0, length)
diff --git a/python/taichi/algorithms.py b/python/taichi/algorithms.py
@@ -0,0 +1,93 @@
+from taichi._kernels import (blit_from_field_to_field, scan_add_inclusive,
+                             sort_stage, uniform_add, warp_shfl_up_i32)
+from taichi.lang.impl import current_cfg, field
+from taichi.lang.kernel_impl import data_oriented
+from taichi.lang.misc import cuda, vulkan
+from taichi.lang.runtime_ops import sync
+from taichi.lang.simt import subgroup
+from taichi.types.primitive_types import i32
+
+
+# Odd-even merge sort
+# References:
+# https://developer.nvidia.com/gpugems/gpugems2/part-vi-simulation-and-numerical-algorithms/chapter-46-improved-gpu-sorting
+# https://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
+def parallel_sort(keys, values=None):
+    """Run the odd-even merge sort stages over ``keys`` (and ``values``).
+
+    The work is done by repeated launches of the ``sort_stage`` kernel,
+    which operates directly on the containers passed in.
+
+    Args:
+        keys: Container whose ``shape[0]`` gives the number of elements.
+        values: Optional companion container. When omitted, ``keys`` is
+            passed in its place and ``sort_stage`` is told not to use it
+            (``use_values=0``).
+    """
+    N = keys.shape[0]
+
+    # Without a separate values container, keys fills the values slot and
+    # the kernel is told to ignore it.
+    use_values = 0 if values is None else 1
+    payload = keys if values is None else values
+
+    # p doubles on every outer pass; for each p, k is halved from p down to 1.
+    p = 1
+    while p < N:
+        k = p
+        while k >= 1:
+            # Pure integer arithmetic: the operands are non-negative, so
+            # floor division matches the former int(a / b) without a float
+            # round-trip.
+            invocations = (N - k - k % p) // (2 * k) + 1
+            sort_stage(keys, use_values, payload, N, p, k, invocations)
+            # Synchronize after each stage before launching the next one.
+            sync()
+            k //= 2
+        p *= 2
+
+
+# In-place inclusive parallel prefix sum (scan)
+# Ref[0]: https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/scan/doc/scan.pdf
+# Ref[1]: https://github.com/NVIDIA/cuda-samples/blob/master/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
+@data_oriented
+class PrefixSumExecutor:
+    """In-place inclusive prefix sum (scan) for ``i32`` arrays.
+
+    The constructor computes the per-level layout and allocates one scratch
+    field sized for ``length``; ``run`` then reuses that field on every call.
+
+    Args:
+        length: Number of leading elements of the input that ``run`` scans.
+    """
+    def __init__(self, length):
+        # Name kept for compatibility; it is the scan length used by run().
+        self.sorting_length = length
+
+        BLOCK_SZ = 64
+
+        # Buffer position and length
+        # This is a single buffer implementation for ease of aot usage
+        ele_num = length
+        self.ele_nums = [ele_num]
+        start_pos = 0
+        self.ele_nums_pos = [start_pos]
+
+        # Each level holds ceil(previous / BLOCK_SZ) elements and reserves a
+        # whole number of BLOCK_SZ-sized slots in the shared buffer.
+        while ele_num > 1:
+            # Ceiling division in integer arithmetic (no float round-trip).
+            ele_num = (ele_num + BLOCK_SZ - 1) // BLOCK_SZ
+            self.ele_nums.append(ele_num)
+            start_pos += BLOCK_SZ * ele_num
+            self.ele_nums_pos.append(start_pos)
+
+        self.large_arr = field(i32, shape=start_pos)
+
+    def run(self, input_arr):
+        """Replace the first ``length`` entries of ``input_arr`` by their scan.
+
+        Args:
+            input_arr: Array with ``dtype`` equal to ``i32``. Its contents are
+                copied into the scratch field, scanned there, and copied back.
+
+        Raises:
+            RuntimeError: If ``input_arr.dtype`` is not ``i32``, or if the
+                current arch is neither ``cuda`` nor ``vulkan``.
+        """
+        length = self.sorting_length
+        ele_nums = self.ele_nums
+        ele_nums_pos = self.ele_nums_pos
+
+        if input_arr.dtype != i32:
+            raise RuntimeError("Only ti.i32 type is supported for prefix sum.")
+
+        # Pick the backend-specific inclusive-add primitive.
+        arch = current_cfg().arch
+        if arch == cuda:
+            inclusive_add = warp_shfl_up_i32
+        elif arch == vulkan:
+            inclusive_add = subgroup.inclusive_add
+        else:
+            raise RuntimeError(
+                f"{str(arch)} is not supported for prefix sum.")
+
+        blit_from_field_to_field(self.large_arr, input_arr, 0, length)
+
+        # Kogge-Stone construction
+        # Only the final level is launched with the single-block flag set.
+        last_level = len(ele_nums) - 2
+        for i in range(len(ele_nums) - 1):
+            scan_add_inclusive(self.large_arr, ele_nums_pos[i],
+                               ele_nums_pos[i + 1], i == last_level,
+                               inclusive_add)
+
+        # Walk the levels in reverse, applying uniform_add to each.
+        for i in range(len(ele_nums) - 3, -1, -1):
+            uniform_add(self.large_arr, ele_nums_pos[i], ele_nums_pos[i + 1])
+
+        blit_from_field_to_field(input_arr, self.large_arr, 0, length)
diff --git a/tests/python/test_scan.py b/tests/python/test_scan.py
@@ -15,7 +15,12 @@ def fill():
                 arr_aux[i] = arr[i]
 
         fill()
-        ti._kernels.prefix_sum_inclusive_inplace(arr, N)
+
+        # Perform an in-place inclusive parallel prefix sum.
+        # Only one executor is needed for a given array length.
+        executor = ti.algorithms.PrefixSumExecutor(N)
+
+        executor.run(arr)
 
         cur_sum = 0
         for i in range(N):

diff --git a/tests/python/test_sort.py b/tests/python/test_sort.py
@@ -15,7 +15,7 @@ def fill():
                 values[i] = keys[i]
 
         fill()
-        ti._kernels.parallel_sort(keys, values)
+        ti.algorithms.parallel_sort(keys, values)
 
         keys_host = keys.to_numpy()
         values_host = values.to_numpy()