Skip to content

Commit

Permalink
New ops for RCNN + old ops improvements for RCNN (apache#16215)
Browse files Browse the repository at this point in the history
* box encode and box decode

seems to work now

bug fix

use template to get rid of if statement

* roi align ignore batchid < 0

* amp multicast support casting to narrowest type

cpplint

cpplint

* add unittest

* address comments

* fix amp_multicast
  • Loading branch information
Jerryzcn authored and aaronmarkham committed Oct 16, 2019
1 parent 02ed060 commit 6186aaf
Show file tree
Hide file tree
Showing 7 changed files with 368 additions and 4 deletions.
278 changes: 278 additions & 0 deletions src/operator/contrib/bounding_box-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,284 @@ void BipartiteMatchingBackward(const nnvm::NodeAttrs& attrs,
});
}


/*!
 * \brief Shape inference for the box-encode operator.
 *
 * Inputs, in order: samples, matches, anchors, refs, means, stds.
 * Both outputs are given the shape of `anchors`.
 *
 * \param attrs node attributes (not used here)
 * \param in_attrs shapes of the six inputs; entries 4 and 5 are assigned shape (4,)
 * \param out_attrs shapes of the two outputs
 * \return true when the inferred output shape is fully known
 */
inline bool BoxEncodeShape(const nnvm::NodeAttrs& attrs,
                           mxnet::ShapeVector *in_attrs,
                           mxnet::ShapeVector *out_attrs) {
  CHECK_EQ(in_attrs->size(), 6U);
  CHECK_EQ(out_attrs->size(), 2U);
  const mxnet::TShape& sshape = (*in_attrs)[0];
  const mxnet::TShape& mshape = (*in_attrs)[1];
  const mxnet::TShape& ashape = (*in_attrs)[2];
  const mxnet::TShape& rshape = (*in_attrs)[3];

  CHECK_EQ(sshape.ndim(), 2)
    << "samples shape must have dim == 2, "
    << sshape.ndim() << " provided";

  // NOTE(review): the three checks below are ">=" while their messages say
  // "==". The original comparison is kept; confirm which one is intended.
  CHECK_GE(mshape.ndim(), 2)
    << "matches shape must have dim == 2, "
    << mshape.ndim() << " provided";

  CHECK_GE(ashape.ndim(), 3)
    << "anchors shape must have dim == 3, "
    << ashape.ndim() << " provided";
  int ldim = ashape[ashape.ndim() - 1];
  CHECK_EQ(ldim, 4)
    << "last dimension of anchors must be 4, "
    << ldim << " provided";

  CHECK_GE(rshape.ndim(), 3)
    << "refs shape must have dim == 3, "
    << rshape.ndim() << " provided";
  ldim = rshape[rshape.ndim() - 1];
  CHECK_EQ(ldim, 4)
    << "last dimension of refs must be 4, "
    << ldim << " provided";

  // assign input shape: means and stds are both 4-element vectors
  SHAPE_ASSIGN_CHECK(*in_attrs, 4, mshadow::Shape1(4));
  SHAPE_ASSIGN_CHECK(*in_attrs, 5, mshadow::Shape1(4));

  // assign output shape: targets and masks both mirror the anchors
  mxnet::TShape oshape = ashape;
  SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape);
  SHAPE_ASSIGN_CHECK(*out_attrs, 1, oshape);
  return shape_is_known(oshape);
}

/*!
 * \brief Per-anchor kernel that encodes matched reference boxes as
 *        normalized center offsets relative to their anchors.
 *
 * Anchors and refs are read as corner boxes [xmin, ymin, xmax, ymax].
 * For a positive sample (samples[i] > 0.5) the four targets are
 *   ((ref_x - a_x) / a_width  - means[0]) / stds[0]
 *   ((ref_y - a_y) / a_height - means[1]) / stds[1]
 *   (log(ref_width  / a_width)  - means[2]) / stds[2]
 *   (log(ref_height / a_height) - means[3]) / stds[3]
 * and the four mask entries are 1. For every other sample both the targets
 * and the mask entries are 0.
 */
struct box_encode {
  /*!
   * \param i flat anchor index; the batch index is i / n
   * \param out_targets encoded targets, 4 values per anchor
   * \param out_masks 1/0 validity mask, 4 values per anchor
   * \param samples per-anchor sample flag; > 0.5 marks a positive sample
   * \param matches per-anchor index of the matched reference box
   * \param anchors anchor boxes in corner format, 4 values per anchor
   * \param refs reference boxes in corner format, m boxes per batch
   * \param means 4 values subtracted from the raw encodings
   * \param stds 4 values the shifted encodings are divided by
   * \param m number of reference boxes per batch
   * \param n number of anchors per batch
   */
  template<typename DType>
  MSHADOW_XINLINE static void Map(index_t i, DType *out_targets, DType *out_masks,
                                  const DType *samples, const DType *matches,
                                  const DType *anchors, const DType *refs,
                                  const DType *means, const DType *stds,
                                  const int m, const int n) {
    const index_t a_index = i * 4;
    const bool is_positive = samples[i] > 0.5;
    const DType valid = is_positive ? 1.0 : 0.0;
    out_masks[a_index + 0] = valid;
    out_masks[a_index + 1] = valid;
    out_masks[a_index + 2] = valid;
    out_masks[a_index + 3] = valid;

    if (!is_positive) {
      // Non-positive samples may carry a meaningless match index (e.g. -1).
      // Return before touching `refs` so the kernel never reads out of
      // bounds or takes the log of a garbage ratio for them.
      out_targets[a_index + 0] = static_cast<DType>(0.0);
      out_targets[a_index + 1] = static_cast<DType>(0.0);
      out_targets[a_index + 2] = static_cast<DType>(0.0);
      out_targets[a_index + 3] = static_cast<DType>(0.0);
      return;
    }

    const index_t j = i / n;  // batch index
    const index_t match = matches[i];
    // xmin: 0, ymin:1, xmax: 2, ymax: 3
    // x:0, y:1, w:2, h:3
    const index_t ref_index = (j * m + match) * 4;
    DType ref_xmin = refs[ref_index + 0];
    DType ref_ymin = refs[ref_index + 1];
    DType ref_width = refs[ref_index + 2] - ref_xmin;
    DType ref_height = refs[ref_index + 3] - ref_ymin;
    DType ref_x = ref_xmin + ref_width * 0.5;
    DType ref_y = ref_ymin + ref_height * 0.5;
    DType a_xmin = anchors[a_index + 0];
    DType a_ymin = anchors[a_index + 1];
    DType a_width = anchors[a_index + 2] - a_xmin;
    DType a_height = anchors[a_index + 3] - a_ymin;
    DType a_x = a_xmin + a_width * 0.5;
    DType a_y = a_ymin + a_height * 0.5;
    out_targets[a_index + 0] =
      ((ref_x - a_x) / a_width - static_cast<DType>(means[0])) /
      static_cast<DType>(stds[0]);
    out_targets[a_index + 1] =
      ((ref_y - a_y) / a_height - static_cast<DType>(means[1])) /
      static_cast<DType>(stds[1]);
    out_targets[a_index + 2] =
      (log(ref_width / a_width) - static_cast<DType>(means[2])) /
      static_cast<DType>(stds[2]);
    out_targets[a_index + 3] =
      (log(ref_height / a_height) - static_cast<DType>(means[3])) /
      static_cast<DType>(stds[3]);
  }
};

/*!
 * \brief Forward pass of box-encode: launches the box_encode kernel once per
 *        (batch, anchor) pair.
 *
 * \param attrs node attributes (not used here)
 * \param ctx operator context providing the device stream
 * \param inputs samples, matches, anchors, refs, means, stds
 * \param req write requests (not consulted here)
 * \param outputs encoded targets and masks
 */
template<typename xpu>
void BoxEncodeForward(const nnvm::NodeAttrs& attrs,
                      const OpContext& ctx,
                      const std::vector<TBlob>& inputs,
                      const std::vector<OpReqType>& req,
                      const std::vector<TBlob>& outputs) {
  using namespace mshadow;
  using namespace mshadow::expr;
  using namespace mxnet_op;
  CHECK_EQ(inputs.size(), 6U);
  CHECK_EQ(outputs.size(), 2U);
  Stream<xpu> *stream = ctx.get_stream<xpu>();

  // All sizes are derived from the anchors (input 2) and refs (input 3).
  const mxnet::TShape& anchor_shape = inputs[2].shape_;
  const int num_items = anchor_shape.ProdShape(0, 2);
  const int batch = anchor_shape[0];
  const int num_anchors = anchor_shape[1];
  const int num_refs = inputs[3].shape_[1];

  const Shape<2> per_anchor = Shape2(batch, num_anchors);
  const Shape<3> anchor_boxes = Shape3(batch, num_anchors, 4);
  const Shape<3> ref_boxes = Shape3(batch, num_refs, 4);
  const Shape<1> four = Shape1(4);

  MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
    // Views are requested in the same order as the operator's argument list.
    Tensor<xpu, 2, DType> samples_t =
      inputs[0].get_with_shape<xpu, 2, DType>(per_anchor, stream);
    Tensor<xpu, 2, DType> matches_t =
      inputs[1].get_with_shape<xpu, 2, DType>(per_anchor, stream);
    Tensor<xpu, 3, DType> anchors_t =
      inputs[2].get_with_shape<xpu, 3, DType>(anchor_boxes, stream);
    Tensor<xpu, 3, DType> refs_t =
      inputs[3].get_with_shape<xpu, 3, DType>(ref_boxes, stream);
    Tensor<xpu, 1, DType> means_t =
      inputs[4].get_with_shape<xpu, 1, DType>(four, stream);
    Tensor<xpu, 1, DType> stds_t =
      inputs[5].get_with_shape<xpu, 1, DType>(four, stream);
    Tensor<xpu, 3, DType> targets_t =
      outputs[0].get_with_shape<xpu, 3, DType>(anchor_boxes, stream);
    Tensor<xpu, 3, DType> masks_t =
      outputs[1].get_with_shape<xpu, 3, DType>(anchor_boxes, stream);

    Kernel<box_encode, xpu>::Launch(
      stream, num_items,
      targets_t.dptr_, masks_t.dptr_,
      samples_t.dptr_, matches_t.dptr_,
      anchors_t.dptr_, refs_t.dptr_,
      means_t.dptr_, stds_t.dptr_,
      num_refs, num_anchors);
  });
}

// Parameters of the box-decode operator (parsed via ParamParser<BoxDecodeParam>).
struct BoxDecodeParam : public dmlc::Parameter<BoxDecodeParam> {
// Per-coordinate scale factors. NOTE(review): the box_decode kernel
// multiplies the encoded values by these, although the descriptions below
// say "divided from" -- confirm the intended wording.
float std0;
float std1;
float std2;
float std3;
// Upper bound applied to the scaled width/height deltas before exp();
// BoxDecodeForward only enables clipping when this is > 0.
float clip;
// Anchor layout, one of box_common_enum::kCorner / box_common_enum::kCenter.
int format;
DMLC_DECLARE_PARAMETER(BoxDecodeParam) {
DMLC_DECLARE_FIELD(std0).set_default(1.0)
.describe("value to be divided from the 1st encoded values");
DMLC_DECLARE_FIELD(std1).set_default(1.0)
.describe("value to be divided from the 2nd encoded values");
DMLC_DECLARE_FIELD(std2).set_default(1.0)
.describe("value to be divided from the 3rd encoded values");
DMLC_DECLARE_FIELD(std3).set_default(1.0)
.describe("value to be divided from the 4th encoded values");
// The default of -1 leaves clipping disabled.
DMLC_DECLARE_FIELD(clip).set_default(-1.0)
.describe("If larger than 0, bounding box target will be clipped to this value.");
// Accepts the strings "corner" and "center"; the default is center.
DMLC_DECLARE_FIELD(format).set_default(box_common_enum::kCenter)
.add_enum("corner", box_common_enum::kCorner)
.add_enum("center", box_common_enum::kCenter)
.describe("The box encoding type. \n"
" \"corner\" means boxes are encoded as [xmin, ymin, xmax, ymax],"
" \"center\" means boxes are encodes as [x, y, width, height].");
}
}; // BoxDecodeParam

/*!
 * \brief Shape inference for the box-decode operator.
 *
 * Inputs are (data, anchors); the single output takes the shape of `data`.
 *
 * \param attrs node attributes (not used here)
 * \param in_attrs shapes of the two inputs
 * \param out_attrs shape of the single output
 * \return true when the inferred output shape is fully known
 */
inline bool BoxDecodeShape(const nnvm::NodeAttrs& attrs,
                           mxnet::ShapeVector *in_attrs,
                           mxnet::ShapeVector *out_attrs) {
  CHECK_EQ(in_attrs->size(), 2U);
  CHECK_EQ(out_attrs->size(), 1U);
  const mxnet::TShape& data_shape = (*in_attrs)[0];
  const mxnet::TShape& anchor_shape = (*in_attrs)[1];

  // data: exactly rank 3, with 4 coordinates in the last axis
  CHECK_EQ(data_shape.ndim(), 3)
    << "data shape must have dim == 3, "
    << data_shape.ndim() << " provided";
  const int data_last = data_shape[data_shape.ndim() - 1];
  CHECK_EQ(data_last, 4)
    << "last dimension of data must be 4, "
    << data_last << " provided";

  // anchors: rank of at least 3, with 4 coordinates in the last axis
  // NOTE(review): the check is ">=" while the message says "=="; confirm intent.
  CHECK_GE(anchor_shape.ndim(), 3)
    << "anchors shape must have dim == 3, "
    << anchor_shape.ndim() << " provided";
  const int anchor_last = anchor_shape[anchor_shape.ndim() - 1];
  CHECK_EQ(anchor_last, 4)
    << "last dimension of anchors must be 4, "
    << anchor_last << " provided";

  // the decoded boxes have the same shape as the encoded input
  mxnet::TShape decoded_shape = data_shape;
  SHAPE_ASSIGN_CHECK(*out_attrs, 0, decoded_shape);
  return shape_is_known(decoded_shape);
}

/*!
 * \brief Per-box kernel that turns encoded offsets back into corner boxes
 *        [xmin, ymin, xmax, ymax].
 *
 * \tparam anchor_encode box_common_enum::kCorner when anchors are stored as
 *         corners; otherwise they are read as [x, y, width, height]
 * \tparam has_clip whether the scaled width/height deltas are capped at `clip`
 */
template<int anchor_encode, bool has_clip>
struct box_decode {
  /*!
   * \param i flat box index; anchor (i % n) is used, so the anchors are
   *          reused for every batch entry
   * \param out decoded corner boxes, 4 values per box
   * \param x encoded offsets, 4 values per box
   * \param anchors anchor boxes, 4 values per anchor
   * \param std0 scale applied to the encoded x offset
   * \param std1 scale applied to the encoded y offset
   * \param std2 scale applied to the encoded width delta
   * \param std3 scale applied to the encoded height delta
   * \param clip upper bound for the scaled deltas (only read when has_clip)
   * \param n number of anchors
   */
  template<typename DType>
  MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *x,
                                  const DType *anchors, const DType std0,
                                  const DType std1, const DType std2,
                                  const DType std3, const DType clip,
                                  const int n) {
    const DType *delta = x + i * 4;
    const DType *anchor = anchors + (i % n) * 4;
    DType *box = out + i * 4;

    DType center_x = anchor[0];
    DType center_y = anchor[1];
    DType width = anchor[2];
    DType height = anchor[3];
    if (box_common_enum::kCorner == anchor_encode) {
      // the values loaded above are really xmin, ymin, xmax, ymax
      width = width - center_x;
      height = height - center_y;
      center_x = center_x + width * 0.5;
      center_y = center_y + height * 0.5;
    }

    DType out_x = delta[0] * std0 * width + center_x;
    DType out_y = delta[1] * std1 * height + center_y;
    DType log_w = delta[2] * std2;
    DType log_h = delta[3] * std3;
    if (has_clip) {
      // keep the delta only when it is strictly below the bound
      if (!(log_w < clip)) log_w = clip;
      if (!(log_h < clip)) log_h = clip;
    }
    DType scale_w = exp(log_w);
    DType scale_h = exp(log_h);
    DType half_w = scale_w * width * 0.5;
    DType half_h = scale_h * height * 0.5;

    box[0] = out_x - half_w;
    box[1] = out_y - half_h;
    box[2] = out_x + half_w;
    box[3] = out_y + half_h;
  }
};

/*!
 * \brief Forward pass of box-decode: picks the box_decode kernel variant that
 *        matches the anchor format and clip setting, and launches it once per
 *        (batch, box) pair.
 *
 * \param attrs node attributes holding the parsed BoxDecodeParam
 * \param ctx operator context providing the device stream
 * \param inputs data and anchors
 * \param req write requests (not consulted here)
 * \param outputs decoded boxes
 */
template<typename xpu>
void BoxDecodeForward(const nnvm::NodeAttrs& attrs,
                      const OpContext& ctx,
                      const std::vector<TBlob>& inputs,
                      const std::vector<OpReqType>& req,
                      const std::vector<TBlob>& outputs) {
  using namespace mshadow;
  using namespace mshadow::expr;
  using namespace mxnet_op;
  CHECK_EQ(inputs.size(), 2U);
  CHECK_EQ(outputs.size(), 1U);
  Stream<xpu> *stream = ctx.get_stream<xpu>();
  const mxnet::TShape& data_shape = inputs[0].shape_;
  const int batch = data_shape[0];
  const int num_boxes = data_shape[1];
  const int num_items = batch * num_boxes;
  const BoxDecodeParam& param = nnvm::get<BoxDecodeParam>(attrs.parsed);

  // Evaluated once; the branch order below decides which kernel runs.
  const bool corner_format = box_common_enum::kCorner == param.format;
  const bool center_format = box_common_enum::kCenter == param.format;
  const bool clip_on = param.clip > 0.0;
  const bool clip_off = param.clip <= 0.0;

  MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
    Tensor<xpu, 3, DType> data_t = inputs[0]
      .get_with_shape<xpu, 3, DType>(Shape3(batch, num_boxes, 4), stream);
    Tensor<xpu, 3, DType> anchors_t = inputs[1]
      .get_with_shape<xpu, 3, DType>(Shape3(1, num_boxes, 4), stream);
    Tensor<xpu, 3, DType> out_t = outputs[0]
      .get_with_shape<xpu, 3, DType>(Shape3(batch, num_boxes, 4), stream);

    // The scalar arguments are the same for every kernel variant.
    const DType s0 = static_cast<DType>(param.std0);
    const DType s1 = static_cast<DType>(param.std1);
    const DType s2 = static_cast<DType>(param.std2);
    const DType s3 = static_cast<DType>(param.std3);
    const DType clip_value = static_cast<DType>(param.clip);

    if (corner_format && clip_on) {
      Kernel<box_decode<box_common_enum::kCorner, true>, xpu>::Launch(
        stream, num_items, out_t.dptr_, data_t.dptr_, anchors_t.dptr_,
        s0, s1, s2, s3, clip_value, num_boxes);
    } else if (center_format && clip_on) {
      Kernel<box_decode<box_common_enum::kCenter, true>, xpu>::Launch(
        stream, num_items, out_t.dptr_, data_t.dptr_, anchors_t.dptr_,
        s0, s1, s2, s3, clip_value, num_boxes);
    } else if (corner_format && clip_off) {
      Kernel<box_decode<box_common_enum::kCorner, false>, xpu>::Launch(
        stream, num_items, out_t.dptr_, data_t.dptr_, anchors_t.dptr_,
        s0, s1, s2, s3, clip_value, num_boxes);
    } else {
      // every remaining combination falls back to center format, no clipping
      Kernel<box_decode<box_common_enum::kCenter, false>, xpu>::Launch(
        stream, num_items, out_t.dptr_, data_t.dptr_, anchors_t.dptr_,
        s0, s1, s2, s3, clip_value, num_boxes);
    }
  });
}

} // namespace op
} // namespace mxnet

Expand Down
43 changes: 43 additions & 0 deletions src/operator/contrib/bounding_box.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ namespace op {
// Parameter-struct registrations for the bounding-box operators, one per struct.
DMLC_REGISTER_PARAMETER(BoxNMSParam);
DMLC_REGISTER_PARAMETER(BoxOverlapParam);
DMLC_REGISTER_PARAMETER(BipartiteMatchingParam);
// Parameters of _contrib_box_decode (std0..std3, clip, format).
DMLC_REGISTER_PARAMETER(BoxDecodeParam);

NNVM_REGISTER_OP(_contrib_box_nms)
.add_alias("_contrib_box_non_maximum_suppression")
Expand Down Expand Up @@ -201,5 +202,47 @@ NNVM_REGISTER_OP(_backward_contrib_bipartite_matching)
.set_attr<FCompute>("FCompute<cpu>", BipartiteMatchingBackward<cpu>)
.add_arguments(BipartiteMatchingParam::__FIELDS__());

// Box-encode operator: six inputs (samples, matches, anchors, refs, means,
// stds) and two outputs (encoded targets and their masks). The gradient is
// registered as all-zero via MakeZeroGradNodes.
NNVM_REGISTER_OP(_contrib_box_encode)
.describe(R"doc(Encode bounding boxes training target with normalized center offsets.
Input bounding boxes are using corner type: `x_{min}, y_{min}, x_{max}, y_{max}`.
)doc" ADD_FILELINE)
.set_num_inputs(6)
.set_num_outputs(2)
.set_attr<nnvm::FListInputNames>("FListInputNames",
  [](const NodeAttrs& attrs) {
    return std::vector<std::string>{"samples", "matches", "anchors", "refs", "means", "stds"};
  })
.set_attr<mxnet::FInferShape>("FInferShape", BoxEncodeShape)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<6, 2>)
.set_attr<FCompute>("FCompute<cpu>", BoxEncodeForward<cpu>)
.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
.add_argument("samples", "NDArray-or-Symbol", "(B, N) value +1 (positive), -1 (negative), "
              "0 (ignore)")
.add_argument("matches", "NDArray-or-Symbol", "(B, N) value range [0, M)")
.add_argument("anchors", "NDArray-or-Symbol", "(B, N, 4) encoded in corner")
.add_argument("refs", "NDArray-or-Symbol", "(B, M, 4) encoded in corner")
.add_argument("means", "NDArray-or-Symbol", "(4,) Mean value to be subtracted from encoded values")
.add_argument("stds", "NDArray-or-Symbol", "(4,) Std value to be divided from encoded values");

// Box-decode operator: two inputs (data, anchors) and one output (decoded
// boxes), configured by BoxDecodeParam. The gradient is registered as
// all-zero via MakeZeroGradNodes.
NNVM_REGISTER_OP(_contrib_box_decode)
.describe(R"doc(Decode bounding boxes training target with normalized center offsets.
Input bounding boxes are using corner type: `x_{min}, y_{min}, x_{max}, y_{max}`
or center type: `x, y, width, height`.
)doc" ADD_FILELINE)
.set_num_inputs(2)
.set_num_outputs(1)
.set_attr_parser(ParamParser<BoxDecodeParam>)
.set_attr<nnvm::FListInputNames>("FListInputNames",
  [](const NodeAttrs& attrs) {
    return std::vector<std::string>{"data", "anchors"};
  })
.set_attr<mxnet::FInferShape>("FInferShape", BoxDecodeShape)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)
.set_attr<FCompute>("FCompute<cpu>", BoxDecodeForward<cpu>)
.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
.add_argument("data", "NDArray-or-Symbol", "(B, N, 4) predicted bbox offset")
.add_argument("anchors", "NDArray-or-Symbol", "(1, N, 4) encoded in corner or center")
.add_arguments(BoxDecodeParam::__FIELDS__());

} // namespace op
} // namespace mxnet
7 changes: 7 additions & 0 deletions src/operator/contrib/bounding_box.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,12 @@ NNVM_REGISTER_OP(_contrib_bipartite_matching)

// GPU compute function for the backward pass of bipartite matching.
NNVM_REGISTER_OP(_backward_contrib_bipartite_matching)
.set_attr<FCompute>("FCompute<gpu>", BipartiteMatchingBackward<gpu>);

// GPU compute function for box encode: BoxEncodeForward instantiated for gpu.
NNVM_REGISTER_OP(_contrib_box_encode)
.set_attr<FCompute>("FCompute<gpu>", BoxEncodeForward<gpu>);

// GPU compute function for box decode: BoxDecodeForward instantiated for gpu.
NNVM_REGISTER_OP(_contrib_box_decode)
.set_attr<FCompute>("FCompute<gpu>", BoxDecodeForward<gpu>);

} // namespace op
} // namespace mxnet
11 changes: 9 additions & 2 deletions src/operator/contrib/roi_align.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
int roi_batch_ind = 0;
if (roi_cols == 5) {
roi_batch_ind = offset_bottom_rois[0];
if (roi_batch_ind < 0) {
top_data[n] = 0;
continue;
}
offset_bottom_rois++;
}

Expand Down Expand Up @@ -340,6 +344,7 @@ void ROIAlignBackward(
int roi_batch_ind = 0;
if (rois_cols == 5) {
roi_batch_ind = offset_bottom_rois[0];
if (roi_batch_ind < 0) continue;
offset_bottom_rois++;
}

Expand Down Expand Up @@ -520,7 +525,8 @@ NNVM_REGISTER_OP(_contrib_ROIAlign)
.describe(R"code(
This operator takes a 4D feature map as an input array and region proposals as `rois`,
then align the feature map over sub-regions of input and produces a fixed-sized output array.
This operator is typically used in Faster R-CNN & Mask R-CNN networks.
This operator is typically used in Faster R-CNN & Mask R-CNN networks. If roi batchid is less
than 0, it will be ignored, and the corresponding output will be set to 0.
Different from ROI pooling, ROI Align removes the harsh quantization, properly aligning
the extracted features with the input. RoIAlign computes the value of each sampling point
Expand Down Expand Up @@ -594,7 +600,8 @@ He, Kaiming, et al. "Mask R-CNN." ICCV, 2017
return MakeGradNode("_backward_ROIAlign", n, heads, n->attrs.dict);
})
.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator, a 4D Feature maps")
.add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array")
.add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array, "
"if batchid is less than 0, it will be ignored.")
.add_arguments(ROIAlignParam::__FIELDS__());


Expand Down
Loading

0 comments on commit 6186aaf

Please sign in to comment.