From 7e679bfa174203fdfed4a10af578d61bf1b7df3c Mon Sep 17 00:00:00 2001
From: yucrazing <723284893@qq.com>
Date: Thu, 22 Sep 2022 13:22:53 +0800
Subject: [PATCH 01/12] Add prefix sum executor

---
 python/taichi/__init__.py   |  2 +-
 python/taichi/_kernels.py   | 74 +-----------------------------
 python/taichi/algorithms.py | 89 +++++++++++++++++++++++++++++++++++++
 tests/python/test_scan.py   |  6 ++-
 tests/python/test_sort.py   |  2 +-
 5 files changed, 97 insertions(+), 76 deletions(-)
 create mode 100644 python/taichi/algorithms.py

diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py
index 8870837b8d8c3..b55ca644183d5 100644
--- a/python/taichi/__init__.py
+++ b/python/taichi/__init__.py
@@ -9,7 +9,7 @@
 # Provide a shortcut to types since they're commonly used.
 from taichi.types.primitive_types import *
 
-from taichi import ad, experimental, graph, linalg, math, tools
+from taichi import ad, algorithms, experimental, graph, linalg, math, tools
 from taichi.ui import GUI, hex_to_rgb, rgb_to_hex, ui
 
 # Issue#2223: Do not reorder, or we're busted with partially initialized module
diff --git a/python/taichi/_kernels.py b/python/taichi/_kernels.py
index f1d81f3856de9..201ac4078d929 100644
--- a/python/taichi/_kernels.py
+++ b/python/taichi/_kernels.py
@@ -292,9 +292,6 @@ def save_texture_to_numpy(tex: texture_type.rw_texture(num_dimensions=2,
 
 
 # Odd-even merge sort
-# References:
-# https://developer.nvidia.com/gpugems/gpugems2/part-vi-simulation-and-numerical-algorithms/chapter-46-improved-gpu-sorting
-# https://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
 @kernel
 def sort_stage(keys: template(), use_values: int, values: template(), N: int,
                p: int, k: int, invocations: int):
@@ -315,26 +312,7 @@ def sort_stage(keys: template(), use_values: int, values: template(), N: int,
                         values[b] = temp
 
 
-def parallel_sort(keys, values=None):
-    N = keys.shape[0]
-
-    num_stages = 0
-    p = 1
-    while p < N:
-        k = p
-        while k >= 1:
-            invocations = int((N - k - k % p) / (2 * k)) + 1
-            if values is None:
-                sort_stage(keys, 0, keys, N, p, k, invocations)
-            else:
-                sort_stage(keys, 1, values, N, p, k, invocations)
-            num_stages += 1
-            sync()
-            k = int(k / 2)
-        p = int(p * 2)
-    print(num_stages)
-
-
+# Parallel Prefix Sum (Scan)
 @func
 def warp_shfl_up_i32(val: template()):
     global_tid = block.global_thread_idx()
@@ -421,53 +399,3 @@ def blit_from_field_to_field(
         dst: template(), src: template(), offset: i32, size: i32):
     for i in range(size):
         dst[i + offset] = src[i]
-
-
-# Parallel Prefix Sum (Scan)
-# Ref[0]: https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/scan/doc/scan.pdf
-# Ref[1]: https://github.com/NVIDIA/cuda-samples/blob/master/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
-def prefix_sum_inclusive_inplace(input_arr, length):
-    BLOCK_SZ = 64
-    GRID_SZ = int((length + BLOCK_SZ - 1) / BLOCK_SZ)
-
-    # Buffer position and length
-    # This is a single buffer implementation for ease of aot usage
-    ele_num = length
-    ele_nums = [ele_num]
-    start_pos = 0
-    ele_nums_pos = [start_pos]
-
-    while ele_num > 1:
-        ele_num = int((ele_num + BLOCK_SZ - 1) / BLOCK_SZ)
-        ele_nums.append(ele_num)
-        start_pos += BLOCK_SZ * ele_num
-        ele_nums_pos.append(start_pos)
-
-    if input_arr.dtype != i32:
-        raise RuntimeError("Only ti.i32 type is supported for prefix sum.")
-
-    large_arr = field(i32, shape=start_pos)
-
-    if current_cfg().arch == cuda:
-        inclusive_add = warp_shfl_up_i32
-    elif current_cfg().arch == vulkan:
-        inclusive_add = subgroup.inclusive_add
-    else:
-        raise RuntimeError(
-            f"{str(current_cfg().arch)} is not supported for prefix sum.")
-
-    blit_from_field_to_field(large_arr, input_arr, 0, length)
-
-    # Kogge-Stone construction
-    for i in range(len(ele_nums) - 1):
-        if i == len(ele_nums) - 2:
-            scan_add_inclusive(large_arr, ele_nums_pos[i], ele_nums_pos[i + 1],
-                               True, inclusive_add)
-        else:
-            scan_add_inclusive(large_arr, ele_nums_pos[i], ele_nums_pos[i + 1],
-                               False, inclusive_add)
-
-    for i in range(len(ele_nums) - 3, -1, -1):
-        uniform_add(large_arr, ele_nums_pos[i], ele_nums_pos[i + 1])
-
-    blit_from_field_to_field(input_arr, large_arr, 0, length)
diff --git a/python/taichi/algorithms.py b/python/taichi/algorithms.py
new file mode 100644
index 0000000000000..2ba68a930c008
--- /dev/null
+++ b/python/taichi/algorithms.py
@@ -0,0 +1,89 @@
+from taichi.types.primitive_types import i32
+from taichi.lang.impl import current_cfg, field
+from taichi.lang.kernel_impl import data_oriented
+from taichi.lang.runtime_ops import sync
+from taichi.lang.simt import subgroup
+from taichi.lang.misc import cuda, vulkan
+from taichi._kernels import sort_stage
+from taichi._kernels import warp_shfl_up_i32, blit_from_field_to_field, scan_add_inclusive, uniform_add
+
+# Odd-even merge sort
+# References:
+# https://developer.nvidia.com/gpugems/gpugems2/part-vi-simulation-and-numerical-algorithms/chapter-46-improved-gpu-sorting
+# https://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
+def parallel_sort(keys, values=None):
+    N = keys.shape[0]
+
+    num_stages = 0
+    p = 1
+    while p < N:
+        k = p
+        while k >= 1:
+            invocations = int((N - k - k % p) / (2 * k)) + 1
+            if values is None:
+                sort_stage(keys, 0, keys, N, p, k, invocations)
+            else:
+                sort_stage(keys, 1, values, N, p, k, invocations)
+            num_stages += 1
+            sync()
+            k = int(k / 2)
+        p = int(p * 2)
+    print(num_stages)
+
+
+# Parallel Prefix Sum (Scan)
+# Ref[0]: https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/scan/doc/scan.pdf
+# Ref[1]: https://github.com/NVIDIA/cuda-samples/blob/master/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
+@data_oriented
+class PrefixSumExecutor:
+    def __init__(self):
+        self.large_arr = None
+        self.sorting_length = -1
+
+    def prefix_sum_inclusive_inplace(self, input_arr, length):
+        BLOCK_SZ = 64
+        GRID_SZ = int((length + BLOCK_SZ - 1) / BLOCK_SZ)
+
+        # Buffer position and length
+        # This is a single buffer implementation for ease of aot usage
+        ele_num = length
+        ele_nums = [ele_num]
+        start_pos = 0
+        ele_nums_pos = [start_pos]
+
+        while ele_num > 1:
+            ele_num = int((ele_num + BLOCK_SZ - 1) / BLOCK_SZ)
+            ele_nums.append(ele_num)
+            start_pos += BLOCK_SZ * ele_num
+            ele_nums_pos.append(start_pos)
+
+        if input_arr.dtype != i32:
+            raise RuntimeError("Only ti.i32 type is supported for prefix sum.")
+
+        if self.large_arr is None or self.sorting_length != length:
+            self.large_arr = field(i32, shape=start_pos)
+            self.sorting_length = length
+
+        if current_cfg().arch == cuda:
+            inclusive_add = warp_shfl_up_i32
+        elif current_cfg().arch == vulkan:
+            inclusive_add = subgroup.inclusive_add
+        else:
+            raise RuntimeError(
+                f"{str(current_cfg().arch)} is not supported for prefix sum.")
+
+        blit_from_field_to_field(self.large_arr, input_arr, 0, length)
+
+        # Kogge-Stone construction
+        for i in range(len(ele_nums) - 1):
+            if i == len(ele_nums) - 2:
+                scan_add_inclusive(self.large_arr, ele_nums_pos[i], ele_nums_pos[i + 1],
+                                True, inclusive_add)
+            else:
+                scan_add_inclusive(self.large_arr, ele_nums_pos[i], ele_nums_pos[i + 1],
+                                False, inclusive_add)
+
+        for i in range(len(ele_nums) - 3, -1, -1):
+            uniform_add(self.large_arr, ele_nums_pos[i], ele_nums_pos[i + 1])
+
+        blit_from_field_to_field(input_arr, self.large_arr, 0, length)
diff --git a/tests/python/test_scan.py b/tests/python/test_scan.py
index 72dd5f415cb30..8fe4037f6ff04 100644
--- a/tests/python/test_scan.py
+++ b/tests/python/test_scan.py
@@ -4,6 +4,10 @@
 
 @test_utils.test(arch=[ti.cuda, ti.vulkan], exclude=[(ti.vulkan, "Darwin")])
 def test_scan():
+    
+    # A global prefix sum wrapper, only need to be initialized once.
+    executor = ti.algorithms.PrefixSumExecutor()
+    
     def test_scan_for_dtype(dtype, N):
         arr = ti.field(dtype, N)
         arr_aux = ti.field(dtype, N)
@@ -15,7 +19,7 @@ def fill():
                 arr_aux[i] = arr[i]
 
         fill()
-        ti._kernels.prefix_sum_inclusive_inplace(arr, N)
+        executor.prefix_sum_inclusive_inplace(arr, N)
 
         cur_sum = 0
         for i in range(N):
diff --git a/tests/python/test_sort.py b/tests/python/test_sort.py
index 1eb3647e038fc..104f81d636fca 100644
--- a/tests/python/test_sort.py
+++ b/tests/python/test_sort.py
@@ -15,7 +15,7 @@ def fill():
                 values[i] = keys[i]
 
         fill()
-        ti._kernels.parallel_sort(keys, values)
+        ti.algorithms.parallel_sort(keys, values)
 
         keys_host = keys.to_numpy()
         values_host = values.to_numpy()

From 9400ddb8c705233d0a61f82900a341f0e36d3d1f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 22 Sep 2022 05:29:32 +0000
Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 python/taichi/algorithms.py | 17 +++++++++--------
 tests/python/test_scan.py   |  4 ++--
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/python/taichi/algorithms.py b/python/taichi/algorithms.py
index 2ba68a930c008..2bbe8caf216ef 100644
--- a/python/taichi/algorithms.py
+++ b/python/taichi/algorithms.py
@@ -1,11 +1,12 @@
-from taichi.types.primitive_types import i32
+from taichi._kernels import (blit_from_field_to_field, scan_add_inclusive,
+                             sort_stage, uniform_add, warp_shfl_up_i32)
 from taichi.lang.impl import current_cfg, field
 from taichi.lang.kernel_impl import data_oriented
+from taichi.lang.misc import cuda, vulkan
 from taichi.lang.runtime_ops import sync
 from taichi.lang.simt import subgroup
-from taichi.lang.misc import cuda, vulkan
-from taichi._kernels import sort_stage
-from taichi._kernels import warp_shfl_up_i32, blit_from_field_to_field, scan_add_inclusive, uniform_add
+from taichi.types.primitive_types import i32
+
 
 # Odd-even merge sort
 # References:
@@ -77,11 +78,11 @@ def prefix_sum_inclusive_inplace(self, input_arr, length):
         # Kogge-Stone construction
         for i in range(len(ele_nums) - 1):
             if i == len(ele_nums) - 2:
-                scan_add_inclusive(self.large_arr, ele_nums_pos[i], ele_nums_pos[i + 1],
-                                True, inclusive_add)
+                scan_add_inclusive(self.large_arr, ele_nums_pos[i],
+                                   ele_nums_pos[i + 1], True, inclusive_add)
             else:
-                scan_add_inclusive(self.large_arr, ele_nums_pos[i], ele_nums_pos[i + 1],
-                                False, inclusive_add)
+                scan_add_inclusive(self.large_arr, ele_nums_pos[i],
+                                   ele_nums_pos[i + 1], False, inclusive_add)
 
         for i in range(len(ele_nums) - 3, -1, -1):
             uniform_add(self.large_arr, ele_nums_pos[i], ele_nums_pos[i + 1])
diff --git a/tests/python/test_scan.py b/tests/python/test_scan.py
index 8fe4037f6ff04..fe73b6738c68c 100644
--- a/tests/python/test_scan.py
+++ b/tests/python/test_scan.py
@@ -4,10 +4,10 @@
 
 @test_utils.test(arch=[ti.cuda, ti.vulkan], exclude=[(ti.vulkan, "Darwin")])
 def test_scan():
-    
+
     # A global prefix sum wrapper, only need to be initialized once.
     executor = ti.algorithms.PrefixSumExecutor()
-    
+
     def test_scan_for_dtype(dtype, N):
         arr = ti.field(dtype, N)
         arr_aux = ti.field(dtype, N)

From a9e10c3cd38de2569bd2cc609e6a49ec937dd7d4 Mon Sep 17 00:00:00 2001
From: yucrazing <723284893@qq.com>
Date: Thu, 22 Sep 2022 15:07:56 +0800
Subject: [PATCH 03/12] Refactor PrefixSumExecutor

---
 python/taichi/algorithms.py | 27 +++++++++++++++------------
 tests/python/test_scan.py   | 11 ++++++-----
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/python/taichi/algorithms.py b/python/taichi/algorithms.py
index 2ba68a930c008..d6ec398b46db5 100644
--- a/python/taichi/algorithms.py
+++ b/python/taichi/algorithms.py
@@ -31,39 +31,42 @@ def parallel_sort(keys, values=None):
     print(num_stages)
 
 
-# Parallel Prefix Sum (Scan)
+# Inclusive In-Place's Parallel Prefix Sum (Scan)
 # Ref[0]: https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/scan/doc/scan.pdf
 # Ref[1]: https://github.com/NVIDIA/cuda-samples/blob/master/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
 @data_oriented
 class PrefixSumExecutor:
-    def __init__(self):
+    def __init__(self, length):
         self.large_arr = None
-        self.sorting_length = -1
+        self.sorting_length = length
 
-    def prefix_sum_inclusive_inplace(self, input_arr, length):
         BLOCK_SZ = 64
         GRID_SZ = int((length + BLOCK_SZ - 1) / BLOCK_SZ)
 
         # Buffer position and length
         # This is a single buffer implementation for ease of aot usage
         ele_num = length
-        ele_nums = [ele_num]
+        self.ele_nums = [ele_num]
         start_pos = 0
-        ele_nums_pos = [start_pos]
+        self.ele_nums_pos = [start_pos]
 
         while ele_num > 1:
             ele_num = int((ele_num + BLOCK_SZ - 1) / BLOCK_SZ)
-            ele_nums.append(ele_num)
+            self.ele_nums.append(ele_num)
             start_pos += BLOCK_SZ * ele_num
-            ele_nums_pos.append(start_pos)
+            self.ele_nums_pos.append(start_pos)
+
+        self.large_arr = field(i32, shape=start_pos)
+
+    def run(self, input_arr):
+
+        length = self.sorting_length
+        ele_nums = self.ele_nums
+        ele_nums_pos = self.ele_nums_pos
 
         if input_arr.dtype != i32:
             raise RuntimeError("Only ti.i32 type is supported for prefix sum.")
 
-        if self.large_arr is None or self.sorting_length != length:
-            self.large_arr = field(i32, shape=start_pos)
-            self.sorting_length = length
-
         if current_cfg().arch == cuda:
             inclusive_add = warp_shfl_up_i32
         elif current_cfg().arch == vulkan:
diff --git a/tests/python/test_scan.py b/tests/python/test_scan.py
index 8fe4037f6ff04..9df9a99c90423 100644
--- a/tests/python/test_scan.py
+++ b/tests/python/test_scan.py
@@ -4,10 +4,6 @@
 
 @test_utils.test(arch=[ti.cuda, ti.vulkan], exclude=[(ti.vulkan, "Darwin")])
 def test_scan():
-    
-    # A global prefix sum wrapper, only need to be initialized once.
-    executor = ti.algorithms.PrefixSumExecutor()
-    
     def test_scan_for_dtype(dtype, N):
         arr = ti.field(dtype, N)
         arr_aux = ti.field(dtype, N)
@@ -19,7 +15,12 @@ def fill():
                 arr_aux[i] = arr[i]
 
         fill()
-        executor.prefix_sum_inclusive_inplace(arr, N)
+
+        # Performing an inclusive in-place's parallel prefix sum,
+        # only one exectutor is needed for a specified sorting length.
+        executor = ti.algorithms.PrefixSumExecutor(N)
+        
+        executor.run(arr)
 
         cur_sum = 0
         for i in range(N):

From 3d62afe4be118948eaff0472430683e70134398a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 22 Sep 2022 07:11:11 +0000
Subject: [PATCH 04/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/python/test_scan.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/test_scan.py b/tests/python/test_scan.py
index 9df9a99c90423..55ff669aee5ee 100644
--- a/tests/python/test_scan.py
+++ b/tests/python/test_scan.py
@@ -19,7 +19,7 @@ def fill():
         # Performing an inclusive in-place's parallel prefix sum,
         # only one exectutor is needed for a specified sorting length.
         executor = ti.algorithms.PrefixSumExecutor(N)
-        
+
         executor.run(arr)
 
         cur_sum = 0

From cfabe4f103f388ddb5a900994d55e146bd14664e Mon Sep 17 00:00:00 2001
From: YuZhang <YuCrazing@users.noreply.github.com>
Date: Thu, 22 Sep 2022 15:32:45 +0800
Subject: [PATCH 05/12] Update python/taichi/algorithms.py

Co-authored-by: Ailing  <ailzhang@users.noreply.github.com>
---
 python/taichi/algorithms.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/taichi/algorithms.py b/python/taichi/algorithms.py
index 8ef91195c7197..2b6d3b14ba19d 100644
--- a/python/taichi/algorithms.py
+++ b/python/taichi/algorithms.py
@@ -38,7 +38,6 @@ def parallel_sort(keys, values=None):
 @data_oriented
 class PrefixSumExecutor:
     def __init__(self, length):
-        self.large_arr = None
         self.sorting_length = length
 
         BLOCK_SZ = 64

From b745de4f8fb4e67f014757675ef53725d4c885fd Mon Sep 17 00:00:00 2001
From: YuZhang <YuCrazing@users.noreply.github.com>
Date: Thu, 22 Sep 2022 15:32:52 +0800
Subject: [PATCH 06/12] Update python/taichi/algorithms.py

Co-authored-by: Ailing  <ailzhang@users.noreply.github.com>
---
 python/taichi/algorithms.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/taichi/algorithms.py b/python/taichi/algorithms.py
index 2b6d3b14ba19d..629757b0e41c2 100644
--- a/python/taichi/algorithms.py
+++ b/python/taichi/algorithms.py
@@ -59,7 +59,6 @@ def __init__(self, length):
         self.large_arr = field(i32, shape=start_pos)
 
     def run(self, input_arr):
-
         length = self.sorting_length
         ele_nums = self.ele_nums
         ele_nums_pos = self.ele_nums_pos

From 61c1ef7ddb5d97306f95f037da24b950b6764472 Mon Sep 17 00:00:00 2001
From: yucrazing <723284893@qq.com>
Date: Thu, 22 Sep 2022 15:34:11 +0800
Subject: [PATCH 07/12] Add algorithms to test_api.py && Add docstrings

---
 python/taichi/algorithms.py | 21 ++++++++++++++-------
 tests/python/test_api.py    |  6 +++++-
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/python/taichi/algorithms.py b/python/taichi/algorithms.py
index 8ef91195c7197..d2782ebbe0d7f 100644
--- a/python/taichi/algorithms.py
+++ b/python/taichi/algorithms.py
@@ -8,11 +8,13 @@
 from taichi.types.primitive_types import i32
 
 
-# Odd-even merge sort
-# References:
-# https://developer.nvidia.com/gpugems/gpugems2/part-vi-simulation-and-numerical-algorithms/chapter-46-improved-gpu-sorting
-# https://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
 def parallel_sort(keys, values=None):
+    """Odd-even merge sort
+
+    References:
+        https://developer.nvidia.com/gpugems/gpugems2/part-vi-simulation-and-numerical-algorithms/chapter-46-improved-gpu-sorting
+        https://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
+    """
     N = keys.shape[0]
 
     num_stages = 0
@@ -32,11 +34,16 @@ def parallel_sort(keys, values=None):
     print(num_stages)
 
 
-# Inclusive In-Place's Parallel Prefix Sum (Scan)
-# Ref[0]: https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/scan/doc/scan.pdf
-# Ref[1]: https://github.com/NVIDIA/cuda-samples/blob/master/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
 @data_oriented
 class PrefixSumExecutor:
+    """Parallel Prefix Sum (Scan) Helper
+
+    Use this helper to perform an inclusive in-place's parallel prefix sum.
+    
+    References:
+        https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/scan/doc/scan.pdf
+        https://github.com/NVIDIA/cuda-samples/blob/master/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
+    """
     def __init__(self, length):
         self.large_arr = None
         self.sorting_length = length
diff --git a/tests/python/test_api.py b/tests/python/test_api.py
index 6db245771dd80..9ddc09404915c 100644
--- a/tests/python/test_api.py
+++ b/tests/python/test_api.py
@@ -66,7 +66,7 @@ def _get_expected_matrix_apis():
     'StructField', 'TRACE', 'TaichiAssertionError', 'TaichiCompilationError',
     'TaichiNameError', 'TaichiRuntimeError', 'TaichiRuntimeTypeError',
     'TaichiSyntaxError', 'TaichiTypeError', 'TetMesh', 'Texture', 'TriMesh',
-    'Vector', 'VectorNdarray', 'WARN', 'abs', 'acos', 'activate', 'ad', 'aot',
+    'Vector', 'VectorNdarray', 'WARN', 'abs', 'acos', 'activate', 'ad', 'algorithms', 'aot',
     'append', 'arm64', 'asin', 'assume_in_range', 'atan2', 'atomic_add',
     'atomic_and', 'atomic_max', 'atomic_min', 'atomic_or', 'atomic_sub',
     'atomic_xor', 'axes', 'bit_cast', 'bit_shr', 'block_local',
@@ -93,6 +93,10 @@ def _get_expected_matrix_apis():
     'FwdMode', 'Tape', 'clear_all_gradients', 'grad_for', 'grad_replaced',
     'no_grad'
 ]
+user_api[ti.algorithms] = [
+    'PrefixSumExecutor',
+    'parallel_sort'
+]
 user_api[ti.Field] = [
     'copy_from', 'dtype', 'fill', 'from_numpy', 'from_paddle', 'from_torch',
     'parent', 'shape', 'snode', 'to_numpy', 'to_paddle', 'to_torch'

From 8247a9a0c42a2aa26841821475d6e11064a721e5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 22 Sep 2022 07:36:34 +0000
Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 python/taichi/algorithms.py |  2 +-
 tests/python/test_api.py    | 13 +++++--------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/python/taichi/algorithms.py b/python/taichi/algorithms.py
index 5d5bad75a2324..957696b670d35 100644
--- a/python/taichi/algorithms.py
+++ b/python/taichi/algorithms.py
@@ -39,7 +39,7 @@ class PrefixSumExecutor:
     """Parallel Prefix Sum (Scan) Helper
 
     Use this helper to perform an inclusive in-place's parallel prefix sum.
-    
+
     References:
         https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/scan/doc/scan.pdf
         https://github.com/NVIDIA/cuda-samples/blob/master/Samples/2_Concepts_and_Techniques/shfl_scan/shfl_scan.cu
diff --git a/tests/python/test_api.py b/tests/python/test_api.py
index 9ddc09404915c..7f90cb5831770 100644
--- a/tests/python/test_api.py
+++ b/tests/python/test_api.py
@@ -66,10 +66,10 @@ def _get_expected_matrix_apis():
     'StructField', 'TRACE', 'TaichiAssertionError', 'TaichiCompilationError',
     'TaichiNameError', 'TaichiRuntimeError', 'TaichiRuntimeTypeError',
     'TaichiSyntaxError', 'TaichiTypeError', 'TetMesh', 'Texture', 'TriMesh',
-    'Vector', 'VectorNdarray', 'WARN', 'abs', 'acos', 'activate', 'ad', 'algorithms', 'aot',
-    'append', 'arm64', 'asin', 'assume_in_range', 'atan2', 'atomic_add',
-    'atomic_and', 'atomic_max', 'atomic_min', 'atomic_or', 'atomic_sub',
-    'atomic_xor', 'axes', 'bit_cast', 'bit_shr', 'block_local',
+    'Vector', 'VectorNdarray', 'WARN', 'abs', 'acos', 'activate', 'ad',
+    'algorithms', 'aot', 'append', 'arm64', 'asin', 'assume_in_range', 'atan2',
+    'atomic_add', 'atomic_and', 'atomic_max', 'atomic_min', 'atomic_or',
+    'atomic_sub', 'atomic_xor', 'axes', 'bit_cast', 'bit_shr', 'block_local',
     'cache_read_only', 'cast', 'cc', 'ceil', 'cos', 'cpu', 'cuda',
     'data_oriented', 'dataclass', 'deactivate', 'deactivate_all_snodes',
     'dx11', 'eig', 'exp', 'experimental', 'extension', 'f16', 'f32', 'f64',
@@ -93,10 +93,7 @@ def _get_expected_matrix_apis():
     'FwdMode', 'Tape', 'clear_all_gradients', 'grad_for', 'grad_replaced',
     'no_grad'
 ]
-user_api[ti.algorithms] = [
-    'PrefixSumExecutor',
-    'parallel_sort'
-]
+user_api[ti.algorithms] = ['PrefixSumExecutor', 'parallel_sort']
 user_api[ti.Field] = [
     'copy_from', 'dtype', 'fill', 'from_numpy', 'from_paddle', 'from_torch',
     'parent', 'shape', 'snode', 'to_numpy', 'to_paddle', 'to_torch'

From dc8df44a607a736239500ca5557aaccbd8693cfb Mon Sep 17 00:00:00 2001
From: yucrazing <723284893@qq.com>
Date: Thu, 22 Sep 2022 15:46:23 +0800
Subject: [PATCH 09/12] Remove unused imports

---
 python/taichi/_kernels.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/python/taichi/_kernels.py b/python/taichi/_kernels.py
index 201ac4078d929..09abf7db9bf64 100644
--- a/python/taichi/_kernels.py
+++ b/python/taichi/_kernels.py
@@ -5,11 +5,10 @@
 from taichi.lang._ndrange import ndrange
 from taichi.lang.expr import Expr
 from taichi.lang.field import ScalarField
-from taichi.lang.impl import current_cfg, field, grouped, static, static_assert
+from taichi.lang.impl import grouped, static, static_assert
 from taichi.lang.kernel_impl import func, kernel
-from taichi.lang.misc import cuda, loop_config, vulkan
-from taichi.lang.runtime_ops import sync
-from taichi.lang.simt import block, subgroup, warp
+from taichi.lang.misc import loop_config
+from taichi.lang.simt import block, warp
 from taichi.lang.snode import deactivate
 from taichi.types import ndarray_type, texture_type, vector
 from taichi.types.annotations import template

From d913fc7edf311ea803f0f1d3783b63582fe40d29 Mon Sep 17 00:00:00 2001
From: yucrazing <723284893@qq.com>
Date: Thu, 22 Sep 2022 16:46:55 +0800
Subject: [PATCH 10/12] add __all__

---
 python/taichi/algorithms.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/taichi/algorithms.py b/python/taichi/algorithms.py
index 957696b670d35..8af2bfce5e084 100644
--- a/python/taichi/algorithms.py
+++ b/python/taichi/algorithms.py
@@ -96,3 +96,6 @@ def run(self, input_arr):
             uniform_add(self.large_arr, ele_nums_pos[i], ele_nums_pos[i + 1])
 
         blit_from_field_to_field(input_arr, self.large_arr, 0, length)
+
+
+__all__ = ['parallel_sort', 'PrefixSumExecutor']

From 88f40d33172cc3673af0e99d21c85e83e7ec603b Mon Sep 17 00:00:00 2001
From: yucrazing <723284893@qq.com>
Date: Thu, 22 Sep 2022 18:13:53 +0800
Subject: [PATCH 11/12] Move files into a folder to make CI happy

---
 python/taichi/algorithms/__init__.py                       | 1 +
 python/taichi/{algorithms.py => algorithms/_algorithms.py} | 0
 2 files changed, 1 insertion(+)
 create mode 100644 python/taichi/algorithms/__init__.py
 rename python/taichi/{algorithms.py => algorithms/_algorithms.py} (100%)

diff --git a/python/taichi/algorithms/__init__.py b/python/taichi/algorithms/__init__.py
new file mode 100644
index 0000000000000..78108cf8c10c2
--- /dev/null
+++ b/python/taichi/algorithms/__init__.py
@@ -0,0 +1 @@
+from ._algorithms import *
\ No newline at end of file
diff --git a/python/taichi/algorithms.py b/python/taichi/algorithms/_algorithms.py
similarity index 100%
rename from python/taichi/algorithms.py
rename to python/taichi/algorithms/_algorithms.py

From 2c12fbb98f672d3e43d07850fa0f1d4debc269f4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 22 Sep 2022 10:16:06 +0000
Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 python/taichi/algorithms/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/taichi/algorithms/__init__.py b/python/taichi/algorithms/__init__.py
index 78108cf8c10c2..f13c84f0a7833 100644
--- a/python/taichi/algorithms/__init__.py
+++ b/python/taichi/algorithms/__init__.py
@@ -1 +1 @@
-from ._algorithms import *
\ No newline at end of file
+from ._algorithms import *