V2.0 speedup rpn (open-mmlab#2420)

yhcao6 · web-flow · commit af6e1f861318 · 2020-04-10T22:52:58.000+08:00
* speed up inference of rpn

* nms return sorted inds

* add comment

* minor polish

* rename idxs to inds
diff --git a/mmdet/core/post_processing/bbox_nms.py b/mmdet/core/post_processing/bbox_nms.py
@@ -1,6 +1,6 @@
 import torch
 
-from mmdet.ops.nms import nms_wrapper
+from mmdet.ops.nms import batched_nms
 
 
 def multiclass_nms(multi_bboxes,
@@ -48,29 +48,5 @@ def multiclass_nms(multi_bboxes,
         labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
         return bboxes, labels
 
-    # Modified from https://github.com/pytorch/vision/blob
-    # /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39.
-    # strategy: in order to perform NMS independently per class.
-    # we add an offset to all the boxes. The offset is dependent
-    # only on the class idx, and is large enough so that boxes
-    # from different classes do not overlap
-    max_coordinate = bboxes.max()
-    offsets = labels.to(bboxes) * (max_coordinate + 1)
-    bboxes_for_nms = bboxes + offsets[:, None]
-    nms_cfg_ = nms_cfg.copy()
-    nms_type = nms_cfg_.pop('type', 'nms')
-    nms_op = getattr(nms_wrapper, nms_type)
-    dets, keep = nms_op(
-        torch.cat([bboxes_for_nms, scores[:, None]], 1), **nms_cfg_)
-    bboxes = bboxes[keep]
-    scores = dets[:, -1]  # soft_nms will modify scores
-    labels = labels[keep]
-
-    if keep.size(0) > max_num:
-        _, inds = scores.sort(descending=True)
-        inds = inds[:max_num]
-        bboxes = bboxes[inds]
-        scores = scores[inds]
-        labels = labels[inds]
-
-    return torch.cat([bboxes, scores[:, None]], 1), labels
+    dets, keep = batched_nms(bboxes, scores, labels, nms_cfg)
+    return dets[:max_num], labels[keep[:max_num]]
diff --git a/mmdet/models/anchor_heads/rpn_head.py b/mmdet/models/anchor_heads/rpn_head.py
@@ -4,7 +4,7 @@
 from mmcv.cnn import normal_init
 
 from mmdet.core import delta2bbox
-from mmdet.ops import nms
+from mmdet.ops import batched_nms
 from ..registry import HEADS
 from .anchor_head import AnchorHead
 
@@ -61,7 +61,12 @@ def get_bboxes_single(self,
                           scale_factor,
                           cfg,
                           rescale=False):
-        mlvl_proposals = []
+        # bboxes from different levels should be independent during NMS,
+        # level_ids are used as labels for batched NMS to separate them
+        level_ids = []
+        mlvl_scores = []
+        mlvl_bbox_preds = []
+        mlvl_valid_anchors = []
         for idx in range(len(cls_scores)):
             rpn_cls_score = cls_scores[idx]
             rpn_bbox_pred = bbox_preds[idx]
@@ -79,30 +84,37 @@ def get_bboxes_single(self,
             rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
             anchors = mlvl_anchors[idx]
             if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
-                _, topk_inds = scores.topk(cfg.nms_pre)
+                # sort is faster than topk
+                # _, topk_inds = scores.topk(cfg.nms_pre)
+                ranked_scores, rank_inds = scores.sort(descending=True)
+                topk_inds = rank_inds[:cfg.nms_pre]
+                scores = ranked_scores[:cfg.nms_pre]
                 rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
                 anchors = anchors[topk_inds, :]
-                scores = scores[topk_inds]
-            proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means,
-                                   self.target_stds, img_shape)
-            if cfg.min_bbox_size > 0:
-                w = proposals[:, 2] - proposals[:, 0]
-                h = proposals[:, 3] - proposals[:, 1]
-                valid_inds = torch.nonzero((w >= cfg.min_bbox_size) &
-                                           (h >= cfg.min_bbox_size)).squeeze()
+            mlvl_scores.append(scores)
+            mlvl_bbox_preds.append(rpn_bbox_pred)
+            mlvl_valid_anchors.append(anchors)
+            level_ids.append(
+                scores.new_full((scores.size(0), ), idx, dtype=torch.long))
+
+        scores = torch.cat(mlvl_scores)
+        anchors = torch.cat(mlvl_valid_anchors)
+        rpn_bbox_pred = torch.cat(mlvl_bbox_preds)
+        proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means,
+                               self.target_stds, img_shape)
+        ids = torch.cat(level_ids)
+
+        if cfg.min_bbox_size > 0:
+            w = proposals[:, 2] - proposals[:, 0]
+            h = proposals[:, 3] - proposals[:, 1]
+            valid_inds = torch.nonzero((w >= cfg.min_bbox_size)
+                                       & (h >= cfg.min_bbox_size)).squeeze()
+            if valid_inds.sum().item() != len(proposals):
                 proposals = proposals[valid_inds, :]
                 scores = scores[valid_inds]
-            proposals = torch.cat([proposals, scores.unsqueeze(-1)], dim=-1)
-            proposals, _ = nms(proposals, cfg.nms_thr)
-            proposals = proposals[:cfg.nms_post, :]
-            mlvl_proposals.append(proposals)
-        proposals = torch.cat(mlvl_proposals, 0)
-        if cfg.nms_across_levels:
-            proposals, _ = nms(proposals, cfg.nms_thr)
-            proposals = proposals[:cfg.max_num, :]
-        else:
-            scores = proposals[:, 4]
-            num = min(cfg.max_num, proposals.shape[0])
-            _, topk_inds = scores.topk(num)
-            proposals = proposals[topk_inds, :]
-        return proposals
+                ids = ids[valid_inds]
+
+        # TODO: remove the hard coded nms type
+        nms_cfg = dict(type='nms', iou_thr=cfg.nms_thr)
+        dets, keep = batched_nms(proposals, scores, ids, nms_cfg)
+        return dets[:cfg.nms_post]
diff --git a/mmdet/ops/__init__.py b/mmdet/ops/__init__.py
@@ -8,7 +8,7 @@
                   deform_conv, deform_roi_pooling, modulated_deform_conv)
 from .generalized_attention import GeneralizedAttention
 from .masked_conv import MaskedConv2d
-from .nms import nms, soft_nms
+from .nms import batched_nms, nms, soft_nms
 from .non_local import NonLocal2D
 from .norm import build_norm_layer
 from .plugin import build_plugin_layer
@@ -28,5 +28,5 @@
     'MaskedConv2d', 'ContextBlock', 'GeneralizedAttention', 'NonLocal2D',
     'get_compiler_version', 'get_compiling_cuda_version', 'build_conv_layer',
     'ConvModule', 'ConvWS2d', 'conv_ws_2d', 'build_norm_layer', 'Scale',
-    'build_upsample_layer', 'build_plugin_layer'
+    'build_upsample_layer', 'build_plugin_layer', 'batched_nms'
 ]
diff --git a/mmdet/ops/nms/__init__.py b/mmdet/ops/nms/__init__.py
@@ -1,3 +1,3 @@
-from .nms_wrapper import nms, soft_nms
+from .nms_wrapper import batched_nms, nms, soft_nms
 
-__all__ = ['nms', 'soft_nms']
+__all__ = ['nms', 'soft_nms', 'batched_nms']
diff --git a/mmdet/ops/nms/nms_wrapper.py b/mmdet/ops/nms/nms_wrapper.py
@@ -116,3 +116,36 @@ def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3):
     else:
         return new_dets.numpy().astype(dets.dtype), inds.numpy().astype(
             np.int64)
+
+
+def batched_nms(bboxes, scores, inds, nms_cfg):
+    """Performs non-maximum suppression in a batched fashion.
+
+    Modified from https://github.com/pytorch/vision/blob
+    /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39.
+    In order to perform NMS independently per class, we add an offset to all
+    the boxes. The offset is dependent only on the class idx, and is large
+    enough so that boxes from different classes do not overlap.
+
+    Arguments:
+        bboxes (torch.Tensor): bboxes in shape (N, 4).
+        scores (torch.Tensor): scores in shape (N, ).
+        inds (torch.Tensor): each index value corresponds to a bbox cluster,
+            and NMS will not be applied between elements of different inds,
+            shape (N, ).
+        nms_cfg (dict): specify nms type and other parameters like iou_thr.
+
+    Returns:
+        tuple: kept bboxes and indices.
+    """
+    max_coordinate = bboxes.max()
+    offsets = inds.to(bboxes) * (max_coordinate + 1)
+    bboxes_for_nms = bboxes + offsets[:, None]
+    nms_cfg_ = nms_cfg.copy()
+    nms_type = nms_cfg_.pop('type', 'nms')
+    nms_op = eval(nms_type)
+    dets, keep = nms_op(
+        torch.cat([bboxes_for_nms, scores[:, None]], -1), **nms_cfg_)
+    bboxes = bboxes[keep]
+    scores = dets[:, -1]
+    return torch.cat([bboxes, scores[:, None]], -1), keep
diff --git a/mmdet/ops/nms/src/cuda/nms_kernel.cu b/mmdet/ops/nms/src/cuda/nms_kernel.cu
@@ -132,8 +132,7 @@ at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) {
 
   THCudaFree(state, mask_dev);
   // TODO improve this part
-  return std::get<0>(order_t.index({
-                       keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
-                         order_t.device(), keep.scalar_type())
-                     }).sort(0, false));
+  return order_t.index({
+      keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
+          order_t.device(), keep.scalar_type())});
 }