This module is borrowed from TPU RetinaNet implementation:
https://github.com/tensorflow/tpu/blob/master/models/official/retinanet/anchors.py
"""
- from typing import Optional
+ from typing import Optional, Tuple, Sequence

import numpy as np
import torch
@@ -87,83 +87,6 @@ def decode_box_outputs(rel_codes, anchors, output_xyxy: bool=False):
    return out


- def _generate_anchor_configs(min_level, max_level, num_scales, aspect_ratios):
-     """Generates mapping from output level to a list of anchor configurations.
-
-     A configuration is a tuple of (num_anchors, scale, aspect_ratio).
-
-     Args:
-         min_level: integer number of minimum level of the output feature pyramid.
-
-         max_level: integer number of maximum level of the output feature pyramid.
-
-         num_scales: integer number representing intermediate scales added on each level.
-             For instances, num_scales=2 adds two additional anchor scales [2^0, 2^0.5] on each level.
-
-         aspect_ratios: list of tuples representing the aspect ratio anchors added on each level.
-             For instances, aspect_ratios = [(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level.
-
-     Returns:
-         anchor_configs: a dictionary with keys as the levels of anchors and
-             values as a list of anchor configuration.
-     """
-     anchor_configs = {}
-     for level in range(min_level, max_level + 1):
-         anchor_configs[level] = []
-         for scale_octave in range(num_scales):
-             for aspect in aspect_ratios:
-                 anchor_configs[level].append((2 ** level, scale_octave / float(num_scales), aspect))
-     return anchor_configs
-
-
- def _generate_anchor_boxes(image_size, anchor_scale, anchor_configs):
-     """Generates multiscale anchor boxes.
-
-     Args:
-         image_size: integer number of input image size. The input image has the same dimension for
-             width and height. The image_size should be divided by the largest feature stride 2^max_level.
-
-         anchor_scale: float number representing the scale of size of the base
-             anchor to the feature stride 2^level.
-
-         anchor_configs: a dictionary with keys as the levels of anchors and
-             values as a list of anchor configuration.
-
-     Returns:
-         anchor_boxes: a numpy array with shape [N, 4], which stacks anchors on all feature levels.
-
-     Raises:
-         ValueError: input size must be the multiple of largest feature stride.
-     """
-     boxes_all = []
-     for _, configs in anchor_configs.items():
-         boxes_level = []
-         for config in configs:
-             stride, octave_scale, aspect = config
-             if image_size % stride != 0:
-                 raise ValueError("input size must be divided by the stride.")
-             base_anchor_size = anchor_scale * stride * 2 ** octave_scale
-             anchor_size_x_2 = base_anchor_size * aspect[0] / 2.0
-             anchor_size_y_2 = base_anchor_size * aspect[1] / 2.0
-
-             x = np.arange(stride / 2, image_size, stride)
-             y = np.arange(stride / 2, image_size, stride)
-             xv, yv = np.meshgrid(x, y)
-             xv = xv.reshape(-1)
-             yv = yv.reshape(-1)
-
-             boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2,
-                                yv + anchor_size_y_2, xv + anchor_size_x_2))
-             boxes = np.swapaxes(boxes, 0, 1)
-             boxes_level.append(np.expand_dims(boxes, axis=1))
-         # concat anchors on the same level to the reshape NxAx4
-         boxes_level = np.concatenate(boxes_level, axis=1)
-         boxes_all.append(boxes_level.reshape([-1, 4]))
-
-     anchor_boxes = np.vstack(boxes_all)
-     return anchor_boxes
-
-
def clip_boxes_xyxy(boxes: torch.Tensor, size: torch.Tensor):
    boxes = boxes.clamp(min=0)
    size = torch.cat([size, size], dim=0)
@@ -247,10 +170,26 @@ def generate_detections(
    return detections


+ def get_feat_sizes(image_size: Tuple[int, int], max_level: int):
+     """Get feat widths and heights for all levels.
+     Args:
+         image_size: a tuple (H, W)
+         max_level: maximum feature level.
+     Returns:
+         feat_sizes: a list of tuples (height, width) for each level.
+     """
+     feat_size = image_size
+     feat_sizes = [feat_size]
+     for _ in range(1, max_level + 1):
+         feat_size = ((feat_size[0] - 1) // 2 + 1, (feat_size[1] - 1) // 2 + 1)
+         feat_sizes.append(feat_size)
+     return feat_sizes
+
+
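For illustration (not part of the diff): assuming a (512, 512) input and max_level=7, the successive halving above yields one (height, width) per level.

# Hypothetical usage of get_feat_sizes as defined above.
sizes = get_feat_sizes((512, 512), max_level=7)
# [(512, 512), (256, 256), (128, 128), (64, 64), (32, 32), (16, 16), (8, 8), (4, 4)]
# Index 0 is the input resolution; index L is the size of the stride-2**L feature map.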
class Anchors(nn.Module):
    """RetinaNet Anchors class."""

-     def __init__(self, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size):
+     def __init__(self, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size: Tuple[int, int]):
        """Constructs multiscale RetinaNet anchors.

        Args:
@@ -278,26 +217,77 @@ def __init__(self, min_level, max_level, num_scales, aspect_ratios, anchor_scale
        self.max_level = max_level
        self.num_scales = num_scales
        self.aspect_ratios = aspect_ratios
-         self.anchor_scale = anchor_scale
-         self.image_size = image_size
+         if isinstance(anchor_scale, Sequence):
+             assert len(anchor_scale) == max_level - min_level + 1
+             self.anchor_scales = anchor_scale
+         else:
+             self.anchor_scales = [anchor_scale] * (max_level - min_level + 1)
+
+         assert isinstance(image_size, Sequence) and len(image_size) == 2
+         # FIXME this restriction can likely be relaxed with some additional changes
+         assert image_size[0] % 2 ** max_level == 0, 'Image size must be divisible by 2 ** max_level (128)'
+         assert image_size[1] % 2 ** max_level == 0, 'Image size must be divisible by 2 ** max_level (128)'
+         self.image_size = tuple(image_size)
+         self.feat_sizes = get_feat_sizes(image_size, max_level)
        self.config = self._generate_configs()
        self.register_buffer('boxes', self._generate_boxes())

    def _generate_configs(self):
        """Generate configurations of anchor boxes."""
-         return _generate_anchor_configs(self.min_level, self.max_level, self.num_scales, self.aspect_ratios)
+         anchor_configs = {}
+         feat_sizes = self.feat_sizes
+         for level in range(self.min_level, self.max_level + 1):
+             anchor_configs[level] = []
+             for scale_octave in range(self.num_scales):
+                 for aspect in self.aspect_ratios:
+                     anchor_configs[level].append(
+                         ((feat_sizes[0][0] // feat_sizes[level][0],
+                           feat_sizes[0][1] // feat_sizes[level][1]),
+                          scale_octave / float(self.num_scales), aspect,
+                          self.anchor_scales[level - self.min_level]))
+         return anchor_configs
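For a concrete picture (illustrative values only, assuming min_level=3, max_level=7, num_scales=3, aspect_ratios=[1.0, 2.0, 0.5], anchor_scale=4.0 and a (512, 512) input), each level maps to num_scales * len(aspect_ratios) tuples of ((stride_y, stride_x), octave, aspect, scale).

# Hypothetical inspection of the generated configs.
anchors = Anchors(3, 7, 3, [1.0, 2.0, 0.5], 4.0, (512, 512))
anchors.config[3][:3]
# [((8, 8), 0.0, 1.0, 4.0),   # stride 8 at level 3, first octave, square anchor
#  ((8, 8), 0.0, 2.0, 4.0),   # same cell, wide anchor
#  ((8, 8), 0.0, 0.5, 4.0)]   # same cell, tall anchor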

    def _generate_boxes(self):
        """Generates multiscale anchor boxes."""
-         boxes = _generate_anchor_boxes(self.image_size, self.anchor_scale, self.config)
-         boxes = torch.from_numpy(boxes).float()
-         return boxes
+         boxes_all = []
+         for _, configs in self.config.items():
+             boxes_level = []
+             for config in configs:
+                 stride, octave_scale, aspect, anchor_scale = config
+                 base_anchor_size_x = anchor_scale * stride[1] * 2 ** octave_scale
+                 base_anchor_size_y = anchor_scale * stride[0] * 2 ** octave_scale
+                 if isinstance(aspect, Sequence):
+                     aspect_x = aspect[0]
+                     aspect_y = aspect[1]
+                 else:
+                     aspect_x = np.sqrt(aspect)
+                     aspect_y = 1.0 / aspect_x
+                 anchor_size_x_2 = base_anchor_size_x * aspect_x / 2.0
+                 anchor_size_y_2 = base_anchor_size_y * aspect_y / 2.0
+
+                 x = np.arange(stride[1] / 2, self.image_size[1], stride[1])
+                 y = np.arange(stride[0] / 2, self.image_size[0], stride[0])
+                 xv, yv = np.meshgrid(x, y)
+                 xv = xv.reshape(-1)
+                 yv = yv.reshape(-1)
+
+                 boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2,
+                                    yv + anchor_size_y_2, xv + anchor_size_x_2))
+                 boxes = np.swapaxes(boxes, 0, 1)
+                 boxes_level.append(np.expand_dims(boxes, axis=1))
+
+             # concat anchors on the same level to the reshape NxAx4
+             boxes_level = np.concatenate(boxes_level, axis=1)
+             boxes_all.append(boxes_level.reshape([-1, 4]))
+
+         anchor_boxes = np.vstack(boxes_all)
+         anchor_boxes = torch.from_numpy(anchor_boxes).float()
+         return anchor_boxes
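As a rough check (illustrative only, same assumed configuration as above), the stacked anchor buffer holds sum over levels of H_l * W_l * num_scales * len(aspect_ratios) rows.

# Hypothetical sanity check on the boxes buffer built above.
anchors = Anchors(3, 7, 3, [1.0, 2.0, 0.5], 4.0, (512, 512))
per_level = anchors.feat_sizes[3:8]                       # [(64, 64), (32, 32), (16, 16), (8, 8), (4, 4)]
expected = sum(h * w for h, w in per_level) * anchors.get_anchors_per_location()
assert anchors.boxes.shape[0] == expected                 # 5456 * 9 = 49104 anchors
assert anchors.boxes.shape[1] == 4                        # y1, x1, y2, x2 per anchor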

    def get_anchors_per_location(self):
        return self.num_scales * len(self.aspect_ratios)


- #@torch.jit.script
class AnchorLabeler(object):
    """Labeler for multiscale anchor boxes.
    """
@@ -325,9 +315,6 @@ def __init__(self, anchors, num_classes: int, match_threshold: float = 0.5):
        self.anchors = anchors
        self.match_threshold = match_threshold
        self.num_classes = num_classes
-         self.feat_size = {}
-         for level in range(self.anchors.min_level, self.anchors.max_level + 1):
-             self.feat_size[level] = int(self.anchors.image_size / 2 ** level)
        self.indices_cache = {}

    def label_anchors(self, gt_boxes, gt_labels):
@@ -360,44 +347,25 @@ def label_anchors(self, gt_boxes, gt_labels):
        cls_targets, _, box_targets, _, matches = self.target_assigner.assign(anchor_box_list, gt_box_list, gt_labels)

        # class labels start from 1 and the background class = -1
-         cls_targets -= 1
-         cls_targets = cls_targets.long()
+         cls_targets = (cls_targets - 1).long()

        # Unpack labels.
        """Unpacks an array of cls/box into multiple scales."""
        count = 0
        for level in range(self.anchors.min_level, self.anchors.max_level + 1):
-             feat_size = self.feat_size[level]
-             steps = feat_size ** 2 * self.anchors.get_anchors_per_location()
-             indices = torch.arange(count, count + steps, device=cls_targets.device)
+             feat_size = self.anchors.feat_sizes[level]
+             steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location()
+             cls_targets_out.append(cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
+             box_targets_out.append(box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
            count += steps
-             cls_targets_out.append(
-                 torch.index_select(cls_targets, 0, indices).view([feat_size, feat_size, -1]))
-             box_targets_out.append(
-                 torch.index_select(box_targets, 0, indices).view([feat_size, feat_size, -1]))

        num_positives = (matches.match_results != -1).float().sum()

        return cls_targets_out, box_targets_out, num_positives
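To make the flat-to-pyramid slicing concrete (an illustrative sketch, not part of the diff, using the same assumed configuration as above):

# Hypothetical walk-through of the per-level unpacking performed above.
feat_sizes = [(64, 64), (32, 32), (16, 16), (8, 8), (4, 4)]   # levels 3..7 for a 512x512 input
anchors_per_loc = 9                                           # num_scales * len(aspect_ratios)
count = 0
for h, w in feat_sizes:
    steps = h * w * anchors_per_loc        # level 3: 64 * 64 * 9 = 36864 rows
    # cls slice [count:count + steps] is viewed as [h, w, anchors_per_loc]
    # box slice [count:count + steps] is viewed as [h, w, anchors_per_loc * 4]
    count += steps
# count == 49104, the same total as the anchors stacked in Anchors.boxes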

-     def _build_indices(self, device):
-         anchors_per_loc = self.anchors.get_anchors_per_location()
-         indices_dict = {}
-         count = 0
-         for level in range(self.anchors.min_level, self.anchors.max_level + 1):
-             feat_size = self.feat_size[level]
-             steps = feat_size ** 2 * anchors_per_loc
-             indices = torch.arange(count, count + steps, device=device)
-             indices_dict[level] = indices
-             count += steps
-         return indices_dict
-
-     def _get_indices(self, device, level):
-         if device not in self.indices_cache:
-             self.indices_cache[device] = self._build_indices(device)
-         return self.indices_cache[device][level]
-
-     def batch_label_anchors(self, batch_size: int, gt_boxes, gt_classes):
+     def batch_label_anchors(self, gt_boxes, gt_classes):
+         batch_size = len(gt_boxes)
+         assert len(gt_classes) == len(gt_boxes)
        num_levels = self.anchors.max_level - self.anchors.min_level + 1
        cls_targets_out = [[] for _ in range(num_levels)]
        box_targets_out = [[] for _ in range(num_levels)]
@@ -416,14 +384,16 @@ def batch_label_anchors(self, batch_size: int, gt_boxes, gt_classes):

            # Unpack labels.
            """Unpacks an array of cls/box into multiple scales."""
+             count = 0
            for level in range(self.anchors.min_level, self.anchors.max_level + 1):
                level_index = level - self.anchors.min_level
-                 feat_size = self.feat_size[level]
-                 indices = self._get_indices(cls_targets.device, level)
+                 feat_size = self.anchors.feat_sizes[level]
+                 steps = feat_size[0] * feat_size[1] * self.anchors.get_anchors_per_location()
                cls_targets_out[level_index].append(
-                     torch.index_select(cls_targets, 0, indices).view([feat_size, feat_size, -1]))
+                     cls_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
                box_targets_out[level_index].append(
-                     torch.index_select(box_targets, 0, indices).view([feat_size, feat_size, -1]))
+                     box_targets[count:count + steps].view([feat_size[0], feat_size[1], -1]))
+                 count += steps
                if last_sample:
                    cls_targets_out[level_index] = torch.stack(cls_targets_out[level_index])
                    box_targets_out[level_index] = torch.stack(box_targets_out[level_index])
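A hypothetical call-site update implied by the signature change (labeler and images are placeholder names; the method's return is outside this hunk):

# Before: batch size passed explicitly
# labeler.batch_label_anchors(images.shape[0], gt_boxes, gt_classes)
# After: batch size inferred from the per-sample ground-truth lists
targets = labeler.batch_label_anchors(gt_boxes, gt_classes)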