Remove conditional blocks from RCNN ONNX export (PaddlePaddle#5371)

jerrywgz · web-flow · commit afb3b7a1c784 · 2022-03-18T20:29:15.000+08:00
* support RCNN ONNX export

* clean code

* update cascade rcnn

* add TODO for RPN proposals
diff --git a/ppdet/engine/export_utils.py b/ppdet/engine/export_utils.py
@@ -121,7 +121,7 @@ def _dump_infer_config(config, path, image_shape, model):
     setup_orderdict()
     use_dynamic_shape = True if image_shape[2] == -1 else False
     infer_cfg = OrderedDict({
-        'mode': 'fluid',
+        'mode': 'paddle',
         'draw_threshold': 0.5,
         'metric': config['metric'],
         'use_dynamic_shape': use_dynamic_shape
diff --git a/ppdet/modeling/architectures/cascade_rcnn.py b/ppdet/modeling/architectures/cascade_rcnn.py
@@ -117,8 +117,8 @@ def _forward(self):
                 return bbox_pred, bbox_num, None
             mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs)
             origin_shape = self.bbox_post_process.get_origin_shape()
-            mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred,
-                                               bbox_num, origin_shape)
+            mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,
+                                               origin_shape)
             return bbox_pred, bbox_num, mask_pred
 
     def get_loss(self, ):
diff --git a/ppdet/modeling/architectures/mask_rcnn.py b/ppdet/modeling/architectures/mask_rcnn.py
@@ -115,8 +115,8 @@ def _forward(self):
             bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num,
                                                         im_shape, scale_factor)
             origin_shape = self.bbox_post_process.get_origin_shape()
-            mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred,
-                                               bbox_num, origin_shape)
+            mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,
+                                               origin_shape)
             return bbox_pred, bbox_num, mask_pred
 
     def get_loss(self, ):
diff --git a/ppdet/modeling/heads/mask_head.py b/ppdet/modeling/heads/mask_head.py
@@ -103,7 +103,7 @@ def forward(self, feats):
 
 @register
 class MaskHead(nn.Layer):
-    __shared__ = ['num_classes']
+    __shared__ = ['num_classes', 'export_onnx']
     __inject__ = ['mask_assigner']
     """
     RCNN mask head
@@ -123,9 +123,11 @@ def __init__(self,
                  roi_extractor=RoIAlign().__dict__,
                  mask_assigner='MaskAssigner',
                  num_classes=80,
-                 share_bbox_feat=False):
+                 share_bbox_feat=False,
+                 export_onnx=False):
         super(MaskHead, self).__init__()
         self.num_classes = num_classes
+        self.export_onnx = export_onnx
 
         self.roi_extractor = roi_extractor
         if isinstance(roi_extractor, dict):
@@ -206,7 +208,7 @@ def forward_test(self,
         rois_num (Tensor): The number of prediction for each batch
         scale_factor (Tensor): The scale factor from origin size to input size
         """
-        if rois.shape[0] == 0:
+        if not self.export_onnx and rois.shape[0] == 0:
             mask_out = paddle.full([1, 1, 1, 1], -1)
         else:
             bbox = [rois[:, 2:]]
@@ -218,19 +220,13 @@ def forward_test(self,
 
             mask_feat = self.head(rois_feat)
             mask_logit = self.mask_fcn_logits(mask_feat)
-            mask_num_class = mask_logit.shape[1]
-            if mask_num_class == 1:
+            if self.num_classes == 1:
                 mask_out = F.sigmoid(mask_logit)
             else:
-                num_masks = mask_logit.shape[0]
-                mask_out = []
-                # TODO: need to optimize gather
-                for i in range(mask_logit.shape[0]):
-                    pred_masks = paddle.unsqueeze(
-                        mask_logit[i, :, :, :], axis=0)
-                    mask = paddle.gather(pred_masks, labels[i], axis=1)
-                    mask_out.append(mask)
-                mask_out = F.sigmoid(paddle.concat(mask_out))
+                num_masks = paddle.shape(mask_logit)[0]
+                index = paddle.arange(num_masks).cast('int32')
+                mask_out = mask_logit[index, labels]
+                mask_out = F.sigmoid(mask_out)
         return mask_out
 
     def forward(self,
diff --git a/ppdet/modeling/layers.py b/ppdet/modeling/layers.py
@@ -363,58 +363,59 @@ def __call__(self, inputs, image):
 @register
 @serializable
 class RCNNBox(object):
-    __shared__ = ['num_classes']
+    __shared__ = ['num_classes', 'export_onnx']
 
     def __init__(self,
                  prior_box_var=[10., 10., 5., 5.],
                  code_type="decode_center_size",
                  box_normalized=False,
-                 num_classes=80):
+                 num_classes=80,
+                 export_onnx=False):
         super(RCNNBox, self).__init__()
         self.prior_box_var = prior_box_var
         self.code_type = code_type
         self.box_normalized = box_normalized
         self.num_classes = num_classes
+        self.export_onnx = export_onnx
 
     def __call__(self, bbox_head_out, rois, im_shape, scale_factor):
         bbox_pred = bbox_head_out[0]
         cls_prob = bbox_head_out[1]
         roi = rois[0]
         rois_num = rois[1]
 
-        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
-        scale_list = []
-        origin_shape_list = []
+        if self.export_onnx:
+            onnx_rois_num_per_im = rois_num[0]
+            origin_shape = paddle.expand(im_shape[0, :],
+                                         [onnx_rois_num_per_im, 2])
 
-        batch_size = 1
-        if isinstance(roi, list):
-            batch_size = len(roi)
         else:
-            batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1])
-        # bbox_pred.shape: [N, C*4]
-        for idx in range(batch_size):
-            roi_per_im = roi[idx]
-            rois_num_per_im = rois_num[idx]
-            expand_im_shape = paddle.expand(im_shape[idx, :],
-                                            [rois_num_per_im, 2])
-            origin_shape_list.append(expand_im_shape)
+            origin_shape_list = []
+            if isinstance(roi, list):
+                batch_size = len(roi)
+            else:
+                batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1])
+
+            # bbox_pred.shape: [N, C*4]
+            for idx in range(batch_size):
+                rois_num_per_im = rois_num[idx]
+                expand_im_shape = paddle.expand(im_shape[idx, :],
+                                                [rois_num_per_im, 2])
+                origin_shape_list.append(expand_im_shape)
 
-        origin_shape = paddle.concat(origin_shape_list)
+            origin_shape = paddle.concat(origin_shape_list)
 
         # bbox_pred.shape: [N, C*4]
         # C=num_classes in faster/mask rcnn(bbox_head), C=1 in cascade rcnn(cascade_head)
         bbox = paddle.concat(roi)
-        if bbox.shape[0] == 0:
-            bbox = paddle.zeros([0, bbox_pred.shape[1]], dtype='float32')
-        else:
-            bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var)
+        bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var)
         scores = cls_prob[:, :-1]
 
         # bbox.shape: [N, C, 4]
         # bbox.shape[1] must be equal to scores.shape[1]
-        bbox_num_class = bbox.shape[1]
-        if bbox_num_class == 1:
-            bbox = paddle.tile(bbox, [1, self.num_classes, 1])
+        total_num = bbox.shape[0]
+        bbox_dim = bbox.shape[-1]
+        bbox = paddle.expand(bbox, [total_num, self.num_classes, bbox_dim])
 
         origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1)
         origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1)
@@ -1422,7 +1423,7 @@ def conv_mixer(
         Seq, ActBn = nn.Sequential, lambda x: Seq(x, nn.GELU(), nn.BatchNorm2D(dim))
         Residual = type('Residual', (Seq, ),
                         {'forward': lambda self, x: self[0](x) + x})
-        return Seq(*[
+        return Seq(* [
             Seq(Residual(
                 ActBn(
                     nn.Conv2D(
diff --git a/ppdet/modeling/post_process.py b/ppdet/modeling/post_process.py
@@ -34,14 +34,16 @@
 
 @register
 class BBoxPostProcess(nn.Layer):
-    __shared__ = ['num_classes']
+    __shared__ = ['num_classes', 'export_onnx']
     __inject__ = ['decode', 'nms']
 
-    def __init__(self, num_classes=80, decode=None, nms=None):
+    def __init__(self, num_classes=80, decode=None, nms=None,
+                 export_onnx=False):
         super(BBoxPostProcess, self).__init__()
         self.num_classes = num_classes
         self.decode = decode
         self.nms = nms
+        self.export_onnx = export_onnx
 
     def forward(self, head_out, rois, im_shape, scale_factor):
         """
@@ -52,6 +54,7 @@ def forward(self, head_out, rois, im_shape, scale_factor):
             rois (tuple): roi and rois_num of rpn_head output.
             im_shape (Tensor): The shape of the input image.
             scale_factor (Tensor): The scale factor of the input image.
+            export_onnx (bool): whether export model to onnx
         Returns:
             bbox_pred (Tensor): The output prediction with shape [N, 6], including
                 labels, scores and bboxes. The size of bboxes are corresponding
@@ -62,9 +65,20 @@ def forward(self, head_out, rois, im_shape, scale_factor):
         if self.nms is not None:
             bboxes, score = self.decode(head_out, rois, im_shape, scale_factor)
             bbox_pred, bbox_num, _ = self.nms(bboxes, score, self.num_classes)
+
         else:
             bbox_pred, bbox_num = self.decode(head_out, rois, im_shape,
                                               scale_factor)
+
+        if self.export_onnx:
+            # add fake box after postprocess when exporting onnx 
+            fake_bboxes = paddle.to_tensor(
+                np.array(
+                    [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32'))
+
+            bbox_pred = paddle.concat([bbox_pred, fake_bboxes])
+            bbox_num = bbox_num + 1
+
         return bbox_pred, bbox_num
 
     def get_pred(self, bboxes, bbox_num, im_shape, scale_factor):
@@ -86,45 +100,55 @@ def get_pred(self, bboxes, bbox_num, im_shape, scale_factor):
             pred_result (Tensor): The final prediction results with shape [N, 6]
                 including labels, scores and bboxes.
         """
-
-        bboxes_list = []
-        bbox_num_list = []
-        id_start = 0
-        fake_bboxes = paddle.to_tensor(
-            np.array(
-                [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32'))
-        fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32'))
-
-        # add fake bbox when output is empty for each batch
-        for i in range(bbox_num.shape[0]):
-            if bbox_num[i] == 0:
-                bboxes_i = fake_bboxes
-                bbox_num_i = fake_bbox_num
-            else:
-                bboxes_i = bboxes[id_start:id_start + bbox_num[i], :]
-                bbox_num_i = bbox_num[i]
-                id_start += bbox_num[i]
-            bboxes_list.append(bboxes_i)
-            bbox_num_list.append(bbox_num_i)
-        bboxes = paddle.concat(bboxes_list)
-        bbox_num = paddle.concat(bbox_num_list)
+        if not self.export_onnx:
+            bboxes_list = []
+            bbox_num_list = []
+            id_start = 0
+            fake_bboxes = paddle.to_tensor(
+                np.array(
+                    [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32'))
+            fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32'))
+
+            # add fake bbox when output is empty for each batch
+            for i in range(bbox_num.shape[0]):
+                if bbox_num[i] == 0:
+                    bboxes_i = fake_bboxes
+                    bbox_num_i = fake_bbox_num
+                else:
+                    bboxes_i = bboxes[id_start:id_start + bbox_num[i], :]
+                    bbox_num_i = bbox_num[i]
+                    id_start += bbox_num[i]
+                bboxes_list.append(bboxes_i)
+                bbox_num_list.append(bbox_num_i)
+            bboxes = paddle.concat(bboxes_list)
+            bbox_num = paddle.concat(bbox_num_list)
 
         origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
 
-        origin_shape_list = []
-        scale_factor_list = []
-        # scale_factor: scale_y, scale_x
-        for i in range(bbox_num.shape[0]):
-            expand_shape = paddle.expand(origin_shape[i:i + 1, :],
-                                         [bbox_num[i], 2])
-            scale_y, scale_x = scale_factor[i][0], scale_factor[i][1]
-            scale = paddle.concat([scale_x, scale_y, scale_x, scale_y])
-            expand_scale = paddle.expand(scale, [bbox_num[i], 4])
-            origin_shape_list.append(expand_shape)
-            scale_factor_list.append(expand_scale)
+        if not self.export_onnx:
+            origin_shape_list = []
+            scale_factor_list = []
+            # scale_factor: scale_y, scale_x
+            for i in range(bbox_num.shape[0]):
+                expand_shape = paddle.expand(origin_shape[i:i + 1, :],
+                                             [bbox_num[i], 2])
+                scale_y, scale_x = scale_factor[i][0], scale_factor[i][1]
+                scale = paddle.concat([scale_x, scale_y, scale_x, scale_y])
+                expand_scale = paddle.expand(scale, [bbox_num[i], 4])
+                origin_shape_list.append(expand_shape)
+                scale_factor_list.append(expand_scale)
+
+            self.origin_shape_list = paddle.concat(origin_shape_list)
+            scale_factor_list = paddle.concat(scale_factor_list)
 
-        self.origin_shape_list = paddle.concat(origin_shape_list)
-        scale_factor_list = paddle.concat(scale_factor_list)
+        else:
+            # simplify the computation for bs=1 when exporting onnx
+            scale_y, scale_x = scale_factor[0][0], scale_factor[0][1]
+            scale = paddle.concat(
+                [scale_x, scale_y, scale_x, scale_y]).unsqueeze(0)
+            self.origin_shape_list = paddle.expand(origin_shape,
+                                                   [bbox_num[0], 2])
+            scale_factor_list = paddle.expand(scale, [bbox_num[0], 4])
 
         # bboxes: [N, 6], label, score, bbox
         pred_label = bboxes[:, 0:1]
@@ -170,19 +194,20 @@ def paste_mask(self, masks, boxes, im_h, im_w):
         """
         Paste the mask prediction to the original image.
         """
-
+        x0_int, y0_int = 0, 0
+        x1_int, y1_int = im_w, im_h
         x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1)
-        masks = paddle.unsqueeze(masks, [0, 1])
-        img_y = paddle.arange(0, im_h, dtype='float32') + 0.5
-        img_x = paddle.arange(0, im_w, dtype='float32') + 0.5
+        N = masks.shape[0]
+        img_y = paddle.arange(y0_int, y1_int) + 0.5
+        img_x = paddle.arange(x0_int, x1_int) + 0.5
         img_y = (img_y - y0) / (y1 - y0) * 2 - 1
         img_x = (img_x - x0) / (x1 - x0) * 2 - 1
-        img_x = paddle.unsqueeze(img_x, [1])
-        img_y = paddle.unsqueeze(img_y, [2])
-        N = boxes.shape[0]
+        # img_x, img_y have shapes (N, w), (N, h)
 
-        gx = paddle.expand(img_x, [N, img_y.shape[1], img_x.shape[2]])
-        gy = paddle.expand(img_y, [N, img_y.shape[1], img_x.shape[2]])
+        gx = img_x[:, None, :].expand(
+            [N, paddle.shape(img_y)[1], paddle.shape(img_x)[1]])
+        gy = img_y[:, :, None].expand(
+            [N, paddle.shape(img_y)[1], paddle.shape(img_x)[1]])
         grid = paddle.stack([gx, gy], axis=3)
         img_masks = F.grid_sample(masks, grid, align_corners=False)
         return img_masks[:, 0]
@@ -208,19 +233,13 @@ def __call__(self, mask_out, bboxes, bbox_num, origin_shape):
         # TODO: support bs > 1 and mask output dtype is bool
         pred_result = paddle.zeros(
             [num_mask, origin_shape[0][0], origin_shape[0][1]], dtype='int32')
-        if bbox_num == 1 and bboxes[0][0] == -1:
-            return pred_result
-
-        # TODO: optimize chunk paste
-        pred_result = []
-        for i in range(bboxes.shape[0]):
-            im_h, im_w = origin_shape[i][0], origin_shape[i][1]
-            pred_mask = self.paste_mask(mask_out[i], bboxes[i:i + 1, 2:], im_h,
-                                        im_w)
-            pred_mask = pred_mask >= self.binary_thresh
-            pred_mask = paddle.cast(pred_mask, 'int32')
-            pred_result.append(pred_mask)
-        pred_result = paddle.concat(pred_result)
+
+        im_h, im_w = origin_shape[0][0], origin_shape[0][1]
+        pred_mask = self.paste_mask(mask_out[:, None, :, :], bboxes[:, 2:],
+                                    im_h, im_w)
+        pred_mask = pred_mask >= self.binary_thresh
+        pred_result = paddle.cast(pred_mask, 'int32')
+
         return pred_result
 
 
diff --git a/ppdet/modeling/proposal_generator/rpn_head.py b/ppdet/modeling/proposal_generator/rpn_head.py