forked from apache/tvm
Showing 5 changed files with 214 additions and 0 deletions.
@@ -1,3 +1,4 @@
 # pylint: disable=wildcard-import
 """Faster R-CNN and Mask R-CNN operators"""
 from .roi_align import *
+from .roi_align_v2 import *
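With this wildcard re-export in place, the new operator becomes reachable as topi.vision.rcnn.roi_align_v2. A minimal call sketch, assuming the same pre-0.5 TVM API used by the test further down (tvm.placeholder, tvm.target.create); the shapes here are illustrative only:

import tvm
import topi

data = tvm.placeholder((1, 256, 14, 14), name='data')
rois = tvm.placeholder((32, 5), name='rois')
with tvm.target.create('llvm'):
    out = topi.vision.rcnn.roi_align_v2(data, rois, pooled_size=7, spatial_scale=1.0 / 16)
    s = topi.generic.schedule_roi_align_v2(out)
f = tvm.build(s, [data, rois, out], 'llvm')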
@@ -0,0 +1,91 @@
# pylint: disable=invalid-name
"""Roi align v2 in
https://github.com/TuSimple/mxnet/blob/master/src/operator/contrib/roi_align_v2-inl.h"""

import tvm
from ...util import get_const_tuple


@tvm.target.generic_func
def roi_align_v2(data, rois, pooled_size, spatial_scale):
    """ROI align operator in NCHW layout.

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, channel, height, width]

    rois : tvm.Tensor
        2-D with shape [num_roi, 5]. The last dimension should be in the format
        [batch_index, w_start, h_start, w_end, h_end]

    pooled_size : int
        Output size; the pooled output has shape [pooled_size, pooled_size]

    spatial_scale : float
        Ratio of the input feature map height (or width) to the raw image height (or width).
        Equals the reciprocal of the total stride in the convolutional layers and should be
        in the range (0.0, 1.0]

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [num_roi, channel, pooled_size, pooled_size]
    """
    _, channel, height, width = get_const_tuple(data.shape)
    num_roi, _ = get_const_tuple(rois.shape)

    def _bilinear(i, c, y, x):
        # Bilinearly interpolate data[i, c] at the fractional location (y, x),
        # truncating to get the low indices and clamping the high indices.
        y_low = y.astype('int32')
        x_low = x.astype('int32')
        y_high = tvm.min(tvm.ceil(y).astype('int32'), height - 1)
        x_high = tvm.min(tvm.ceil(x).astype('int32'), width - 1)
        y_lerp = y - y_low
        x_lerp = x - x_low
        bottom = x_lerp * data[i, c, y_high, x_high] + \
                 (1 - x_lerp) * data[i, c, y_high, x_low]
        top = x_lerp * data[i, c, y_low, x_high] + \
              (1 - x_lerp) * data[i, c, y_low, x_low]
        return y_lerp * bottom + (1 - y_lerp) * top

    def _sample(i, c, ph, pw):
        # Compute output element (i, c, ph, pw): scale the ROI to feature-map
        # coordinates and split it into pooled_size x pooled_size bins.
        roi = rois[i]
        batch_index = roi[0].astype('int32')
        roi_start_w = roi[1] * spatial_scale
        roi_start_h = roi[2] * spatial_scale
        roi_end_w = roi[3] * spatial_scale
        roi_end_h = roi[4] * spatial_scale

        roi_h = roi_end_h - roi_start_h
        roi_w = roi_end_w - roi_start_w
        bin_h = roi_h / pooled_size
        bin_w = roi_w / pooled_size

        # Bin boundaries, clamped to the feature map.
        hstart = ph * bin_h
        wstart = pw * bin_w
        hend = (ph + 1) * bin_h
        wend = (pw + 1) * bin_w
        hstart = tvm.min(tvm.max(hstart + roi_start_h, 0), height - 1)
        wstart = tvm.min(tvm.max(wstart + roi_start_w, 0), width - 1)
        hend = tvm.min(tvm.max(hend + roi_start_h, 0), height - 1)
        wend = tvm.min(tvm.max(wend + roi_start_w, 0), width - 1)
        non_empty = tvm.all(hstart < hend, wstart < wend)

        def min_value(dtype):
            # Identity element for the max reduction; empty bins produce 0.
            return tvm.select(non_empty, tvm.min_value(dtype), tvm.const(0.0, dtype))

        # Take a 2x2 grid of bilinear samples inside the bin and reduce with max.
        stride_h = (hend - hstart) / 3.0
        stride_w = (wend - wstart) / 3.0
        hstart += stride_h
        wstart += stride_w
        stride_h = tvm.max(0.01, stride_h)
        stride_w = tvm.max(0.01, stride_w)
        _max = tvm.comm_reducer(lambda x, y: tvm.make._OpMax(x, y), min_value, name='max')
        rh = tvm.reduce_axis((0, tvm.select(non_empty, 2, 0)), 'rh')
        rw = tvm.reduce_axis((0, tvm.select(non_empty, 2, 0)), 'rw')
        return _max(_bilinear(batch_index, c, hstart + rh * stride_h, wstart + rw * stride_w),
                    axis=[rh, rw])

    return tvm.compute((num_roi, channel, pooled_size, pooled_size), _sample,
                       tag='pool,roi_align_v2')
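For reference, the sampling scheme above can be mirrored in plain NumPy. The sketch below is an illustrative assumption, not the MXNet reference kernel: it clamps each bin to the feature map, takes a 2x2 grid of bilinear samples per bin with the same truncating interpolation, reduces them with max, and leaves empty bins at zero, matching the select()/min_value handling in the compute.

import numpy as np

def roi_align_v2_ref(data, rois, pooled_size, spatial_scale):
    # data: [batch, channel, height, width]; rois: [num_roi, 5] laid out as
    # [batch_index, w_start, h_start, w_end, h_end], same as the operator above.
    _, channel, height, width = data.shape
    num_roi = rois.shape[0]
    out = np.zeros((num_roi, channel, pooled_size, pooled_size), dtype=data.dtype)

    def bilinear(b, c, y, x):
        # Truncating bilinear interpolation, mirroring _bilinear above.
        y_low, x_low = int(y), int(x)
        y_high = min(int(np.ceil(y)), height - 1)
        x_high = min(int(np.ceil(x)), width - 1)
        y_lerp, x_lerp = y - y_low, x - x_low
        bottom = x_lerp * data[b, c, y_high, x_high] + (1 - x_lerp) * data[b, c, y_high, x_low]
        top = x_lerp * data[b, c, y_low, x_high] + (1 - x_lerp) * data[b, c, y_low, x_low]
        return y_lerp * bottom + (1 - y_lerp) * top

    for i in range(num_roi):
        b = int(rois[i, 0])
        w0, h0, w1, h1 = rois[i, 1:] * spatial_scale
        bin_h = (h1 - h0) / pooled_size
        bin_w = (w1 - w0) / pooled_size
        for ph in range(pooled_size):
            for pw in range(pooled_size):
                # Clamp the bin to the feature map.
                hstart = min(max(ph * bin_h + h0, 0), height - 1)
                wstart = min(max(pw * bin_w + w0, 0), width - 1)
                hend = min(max((ph + 1) * bin_h + h0, 0), height - 1)
                wend = min(max((pw + 1) * bin_w + w0, 0), width - 1)
                if hstart >= hend or wstart >= wend:
                    continue  # empty bin: output stays 0
                raw_h = (hend - hstart) / 3.0
                raw_w = (wend - wstart) / 3.0
                hstart += raw_h
                wstart += raw_w
                stride_h = max(raw_h, 0.01)
                stride_w = max(raw_w, 0.01)
                for c in range(channel):
                    out[i, c, ph, pw] = max(
                        bilinear(b, c, hstart + rh * stride_h, wstart + rw * stride_w)
                        for rh in range(2) for rw in range(2))
    return out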
@@ -0,0 +1,56 @@
"""Test code for vision package""" | ||
import numpy as np | ||
import tvm | ||
import topi | ||
import math | ||
|
||
from topi.vision import ssd, nms | ||
|
||
|
||
def verify_roi_align(batch_size, in_channel, size, num_roi, pooled_size, spatial_scale):
    data_shape = (batch_size, in_channel, size, size)
    rois_shape = (num_roi, 5)
    data = tvm.placeholder(data_shape)
    rois = tvm.placeholder(rois_shape)
    np_data = np.random.uniform(size=data_shape).astype('float32') * size
    np_rois = np.random.uniform(size=rois_shape).astype('float32') * size
    np_rois[:, 0] = 0  # every ROI uses batch index 0

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            out = topi.vision.rcnn.roi_align_v2(data, rois, pooled_size=pooled_size,
                                                spatial_scale=spatial_scale)
            s = topi.generic.schedule_roi_align_v2(out)

        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_rois = tvm.nd.array(np_rois, ctx)
        tvm_out = tvm.nd.array(
            np.zeros((num_roi, in_channel, pooled_size, pooled_size)).astype(out.dtype), ctx=ctx)
        f = tvm.build(s, [data, rois, out], device)
        f(tvm_data, tvm_rois, tvm_out)

        # Compare against MXNet's contrib ROIAlign_v2 (computed on GPU) as the reference.
        import mxnet
        mx_ctx = mxnet.gpu(0)
        mx_data = mxnet.nd.array(np_data, mx_ctx)
        mx_rois = mxnet.nd.array(np_rois, mx_ctx)
        mx_out = mxnet.nd.contrib.ROIAlign_v2(mx_data, mx_rois,
                                              pooled_size=(pooled_size, pooled_size),
                                              spatial_scale=spatial_scale)
        mx_out = mx_out.asnumpy()

        tvm_out = tvm_out.asnumpy()

        np.testing.assert_allclose(tvm_out, mx_out, rtol=1e-3)

    for device in ['cuda', 'llvm']:
        check_device(device)


def test_roi_align_v2():
    verify_roi_align(1, 1, 14, 64, 7, 1.)
    verify_roi_align(1, 1, 14, 64, 7, 0.5)


if __name__ == "__main__":
    test_roi_align_v2()