Update train script to use native or apex AMP, update to latest timm utils interface, fix sotabench, add resdet50 weights, add csp model defs, cleanup collate fn

rwightman · rwightman · commit e18ad8ef0d61 · 2020-10-02T16:13:47.000-07:00
diff --git a/effdet/bench.py b/effdet/bench.py
@@ -2,6 +2,7 @@
 
 Hacked together by Ross Wightman
 """
+from typing import Optional, Dict
 import torch
 import torch.nn as nn
 from timm.utils import ModelEma
@@ -50,12 +51,15 @@ def _post_process(config, cls_outputs, box_outputs):
 
 @torch.jit.script
 def _batch_detection(
-        batch_size: int, class_out, box_out, anchor_boxes, indices, classes, img_scale, img_size):
+        batch_size: int, class_out, box_out, anchor_boxes, indices, classes,
+        img_scale: Optional[torch.Tensor] = None, img_size: Optional[torch.Tensor] = None):
     batch_detections = []
     # FIXME we may be able to do this as a batch with some tensor reshaping/indexing, PR welcome
     for i in range(batch_size):
+        img_scale_i = None if img_scale is None else img_scale[i]
+        img_size_i = None if img_size is None else img_size[i]
         detections = generate_detections(
-            class_out[i], box_out[i], anchor_boxes, indices[i], classes[i], img_scale[i], img_size[i])
+            class_out[i], box_out[i], anchor_boxes, indices[i], classes[i], img_scale_i, img_size_i)
         batch_detections.append(detections)
     return torch.stack(batch_detections, dim=0)
 
@@ -70,11 +74,14 @@ def __init__(self, model):
             self.config.num_scales, self.config.aspect_ratios,
             self.config.anchor_scale, self.config.image_size)
 
-    def forward(self, x, img_scales, img_size):
+    def forward(self, x, img_info: Optional[Dict[str, torch.Tensor]] = None):
         class_out, box_out = self.model(x)
         class_out, box_out, indices, classes = _post_process(self.config, class_out, box_out)
+        img_info = img_info or {}  # a missing dict means no per-image metadata was supplied
+        img_scale = img_info['img_scale'] if 'img_scale' in img_info else None
+        img_size = img_info['img_size'] if 'img_size' in img_info else None
         return _batch_detection(
-            x.shape[0], class_out, box_out, self.anchors.boxes, indices, classes, img_scales, img_size)
+            x.shape[0], class_out, box_out, self.anchors.boxes, indices, classes, img_scale, img_size)
 
 
 class DetBenchTrain(nn.Module):
@@ -89,7 +96,7 @@ def __init__(self, model):
         self.anchor_labeler = AnchorLabeler(self.anchors, self.config.num_classes, match_threshold=0.5)
         self.loss_fn = DetectionLoss(self.config)
 
-    def forward(self, x, target):
+    def forward(self, x, target: Dict[str, torch.Tensor]):
         class_out, box_out = self.model(x)
         cls_targets, box_targets, num_positives = self.anchor_labeler.batch_label_anchors(
             x.shape[0], target['bbox'], target['cls'])
diff --git a/effdet/config/model_config.py b/effdet/config/model_config.py
@@ -140,7 +140,7 @@ def default_detection_model_configs():
     # My own experimental configs with alternate models, training TBD
     # Note: any 'timm' model in the EfficientDet family can be used as a backbone here.
     resdet50=dict(
-        name='resdet50',  # 'wide'
+        name='resdet50',
         backbone_name='resnet50',
         image_size=640,
         fpn_channels=88,
@@ -150,8 +150,50 @@ def default_detection_model_configs():
         act_type='relu',
         redundant_bias=False,
         separable_conv=False,
-        backbone_args=dict(drop_path_rate=0.1),
-        url='',  # no pretrained weights yet
+        backbone_args=dict(drop_path_rate=0.2),
+        url='https://github.com/rwightman/efficientdet-pytorch/releases/download/v0.1/resdet50_416-08676892.pth',
+    ),
+    cspresdet50=dict(
+        name='cspresdet50',
+        backbone_name='cspresnet50',
+        image_size=640,
+        fpn_channels=88,
+        fpn_cell_repeats=4,
+        box_class_repeats=3,
+        pad_type='',
+        act_type='leaky_relu',
+        redundant_bias=False,
+        separable_conv=False,
+        backbone_args=dict(drop_path_rate=0.2),
+        url='',
+    ),
+    cspresdext50=dict(
+        name='cspresdext50',
+        backbone_name='cspresnext50',
+        image_size=640,
+        fpn_channels=88,
+        fpn_cell_repeats=4,
+        box_class_repeats=3,
+        pad_type='',
+        act_type='leaky_relu',
+        redundant_bias=False,
+        separable_conv=False,
+        backbone_args=dict(drop_path_rate=0.2),
+        url='',
+    ),
+    cspdarkdet53=dict(
+        name='cspdarkdet53',
+        backbone_name='cspdarknet53',
+        image_size=640,
+        fpn_channels=88,
+        fpn_cell_repeats=4,
+        box_class_repeats=3,
+        pad_type='',
+        act_type='leaky_relu',
+        redundant_bias=False,
+        separable_conv=False,
+        backbone_args=dict(drop_path_rate=0.2),
+        url='',
     ),
     efficientdet_w0=dict(
         name='efficientdet_w0',  # 'wide'
diff --git a/effdet/data/loader.py b/effdet/data/loader.py
@@ -11,48 +11,54 @@
 MAX_NUM_INSTANCES = 100
 
 
-class FastCollate:
+class DetectionFastCollate:
 
-    def __init__(self):
-        pass
+    def __init__(self, instance_keys=None, instance_shapes=None, instance_fill=-1, max_instances=MAX_NUM_INSTANCES):
+        instance_keys = instance_keys or {'bbox', 'bbox_ignore', 'cls'}
+        instance_shapes = instance_shapes or dict(
+            bbox=(max_instances, 4), bbox_ignore=(max_instances, 4), cls=(max_instances,))
+        self.instance_info = {k: dict(fill=instance_fill, shape=instance_shapes[k]) for k in instance_keys}
+        self.max_instances = max_instances
 
     def __call__(self, batch):
         batch_size = len(batch)
-
-        # FIXME this needs to be more robust
         target = dict()
-        for k, v in batch[0][1].items():
-            if isinstance(v, np.ndarray):
-                # if a numpy array, assume it relates to object instances, pad to MAX_NUM_INSTANCES
-                target_shape = (batch_size, MAX_NUM_INSTANCES)
-                if len(v.shape) > 1:
-                    target_shape = target_shape + v.shape[1:]
-                target_dtype = torch.float32
+
+        def _get_target(k, v):
+            if k in target:
+                return target[k], k in self.instance_info
+            is_instance = False
+            fill_value = 0
+            if k in self.instance_info:
+                info = self.instance_info[k]
+                is_instance = True
+                fill_value = info['fill']
+                shape = (batch_size,) + info['shape']
+                dtype = torch.float32
             elif isinstance(v, (tuple, list)):
-                # if tuple or list, assume per elem
-                target_shape = (batch_size, len(v))
-                target_dtype = torch.float32 if isinstance(v[0], float) else torch.int32
+                # per batch elem sequence
+                shape = (batch_size, len(v))
+                dtype = torch.float32 if isinstance(v[0], (float, np.floating)) else torch.int32
             else:
-                # scalar, assume per elem
-                target_shape = batch_size,
-                target_dtype = torch.float32 if isinstance(v, float) else torch.int64
-            target[k] = torch.zeros(target_shape, dtype=target_dtype)
-
-        tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8)
+                # per batch elem scalar
+                shape = batch_size,
+                dtype = torch.float32 if isinstance(v, (float, np.floating)) else torch.int64
+            target_tensor = torch.full(shape, fill_value, dtype=dtype)
+            target[k] = target_tensor
+            return target_tensor, is_instance
+
+        img_tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8)
         for i in range(batch_size):
-            tensor[i] += torch.from_numpy(batch[i][0])
+            img_tensor[i] += torch.from_numpy(batch[i][0])
             for tk, tv in batch[i][1].items():
-                if isinstance(tv, np.ndarray) and len(tv.shape):
-                    num_elem = min(tv.shape[0], MAX_NUM_INSTANCES)
-                    target[tk][i, 0:num_elem] = torch.from_numpy(tv[0:num_elem])
+                target_tensor, is_instance = _get_target(tk, tv)
+                if is_instance:
+                    num_elem = min(tv.shape[0], self.max_instances)
+                    target_tensor[i, 0:num_elem] = torch.from_numpy(tv[0:num_elem])
                 else:
-                    target[tk][i] = torch.tensor(tv, dtype=target[tk].dtype)
-
-        return tensor, target
-
+                    target_tensor[i] = torch.tensor(tv, dtype=target_tensor.dtype)
 
-def _to_gpu(v):
-    return v.cuda(non_blocking=True) if isinstance(v, torch.Tensor) else v
+        return img_tensor, target
 
 
 class PrefetchLoader:
@@ -81,7 +87,7 @@ def __iter__(self):
             with torch.cuda.stream(stream):
                 next_input = next_input.cuda(non_blocking=True)
                 next_input = next_input.float().sub_(self.mean).div_(self.std)
-                next_target = {k: _to_gpu(v) for k, v in next_target.items()}
+                next_target = {k: v.cuda(non_blocking=True) for k, v in next_target.items()}
                 if self.random_erasing is not None:
                     next_input = self.random_erasing(next_input, next_target)
 
@@ -165,7 +171,7 @@ def create_loader(
         num_workers=num_workers,
         sampler=sampler,
         pin_memory=pin_mem,
-        collate_fn=FastCollate() if use_prefetcher else torch.utils.data.dataloader.default_collate,
+        collate_fn=DetectionFastCollate() if use_prefetcher else torch.utils.data.dataloader.default_collate,
     )
     if use_prefetcher:
         if is_train:
diff --git a/effdet/efficientdet.py b/effdet/efficientdet.py
@@ -252,7 +252,7 @@ def __init__(self, config, feature_info):
         super(BiFpn, self).__init__()
         norm_layer = config.norm_layer or nn.BatchNorm2d
         norm_kwargs = config.norm_kwargs or {}
-        act_layer = get_act_layer(config.act_layer) or _ACT_LAYER
+        act_layer = get_act_layer(config.act_type) or _ACT_LAYER
         self.config = config
         fpn_config = config.fpn_config or get_fpn_config(
             config.fpn_name, min_level=config.min_level, max_level=config.max_level)
@@ -314,7 +314,7 @@ def __init__(self, config, num_outputs):
         super(HeadNet, self).__init__()
         norm_layer = config.norm_layer or nn.BatchNorm2d
         norm_kwargs = config.norm_kwargs or {}
-        act_layer = get_act_layer(config.act_layer) or _ACT_LAYER
+        act_layer = get_act_layer(config.act_type) or _ACT_LAYER
         self.config = config
         num_anchors = len(config.aspect_ratios) * config.num_scales
 
diff --git a/effdet/factory.py b/effdet/factory.py
@@ -22,24 +22,12 @@ def create_model_from_config(
     if pretrained or checkpoint_path:
         pretrained_backbone = False  # no point in loading backbone weights
 
-    # Config overrides, override some config value from args. FIXME need a cleaner mechanism or allow
-    # config defs via files.
-    redundant_bias = kwargs.pop('redundant_bias', None)
-    if redundant_bias is not None:
-        # override config if set to something
-        config.redundant_bias = redundant_bias
-
-    label_smoothing = kwargs.pop('label_smoothing', None)
-    if label_smoothing is not None:
-        config.label_smoothing = label_smoothing
-
-    legacy_focal = kwargs.pop('legacy_focal', None)
-    if legacy_focal is not None:
-        config.legacy_focal = legacy_focal
-
-    jit_loss = kwargs.pop('jit_loss', None)
-    if jit_loss is not None:
-        config.jit_loss = jit_loss
+    # Config overrides, override some config values via kwargs.
+    overrides = ('redundant_bias', 'label_smoothing', 'legacy_focal', 'jit_loss')
+    for ov in overrides:
+        value = kwargs.pop(ov, None)
+        if value is not None:
+            setattr(config, ov, value)
 
     # create the base model
     model = EfficientDet(config, pretrained_backbone=pretrained_backbone, **kwargs)
diff --git a/effdet/loss.py b/effdet/loss.py
@@ -101,7 +101,8 @@ def focal_loss(logits, targets, alpha: float, gamma: float, normalizer, label_sm
     modulating_factor = (1. - p_t) ** gamma
 
     # apply label smoothing for cross_entropy for each entry.
-    targets = targets * (1. - label_smoothing) + .5 * label_smoothing
+    if label_smoothing > 0.:
+        targets = targets * (1. - label_smoothing) + .5 * label_smoothing
     ce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
 
     # compute the final loss and return
@@ -229,7 +230,7 @@ def loss_fn(
                 alpha=alpha, gamma=gamma, normalizer=num_positives_sum, label_smoothing=label_smoothing)
         cls_loss = cls_loss.view(bs, height, width, -1, num_classes)
         cls_loss = cls_loss * (cls_targets_at_level != -2).unsqueeze(-1)
-        cls_losses.append(cls_loss.sum())
+        cls_losses.append(cls_loss.sum())   # FIXME reference code added a clamp here at some point ...clamp(0, 2))
 
         box_losses.append(_box_loss(
             box_outputs[l].permute(0, 2, 3, 1).float(),
@@ -271,8 +272,6 @@ def forward(
             box_targets: List[torch.Tensor],
             num_positives: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
 
-        #  FIXME I'd like to assign and script the loss fun in the init but deepcopy doesn't work with
-        #  ScriptedFunction/ScriptedModule members right now and deepcopy is required for ModelEma as currently impl
         loss_kwargs = dict(
             num_classes=self.num_classes, alpha=self.alpha, gamma=self.gamma, delta=self.delta,
             box_loss_weight=self.box_loss_weight, label_smoothing=self.label_smoothing, legacy_focal=self.legacy_focal)
diff --git a/sotabench.py b/sotabench.py
@@ -8,8 +8,7 @@
     has_amp = False
 from sotabencheval.object_detection import COCOEvaluator
 from sotabencheval.utils import is_server, extract_archive
-from effdet import create_model
-from effdet.data import DetectionDatset, create_loader
+from effdet import create_model, create_loader, create_dataset
 
 NUM_GPU = 1
 BATCH_SIZE = (128 if has_amp else 64) * NUM_GPU
@@ -42,6 +41,17 @@ def _entry(model_name, paper_model_name, paper_arxiv_id, batch_size=BATCH_SIZE,
 # NOTE For any original PyTorch models, I'll remove from this list when you add to sotabench to
 # avoid overlap and confusion. Please contact me.
 model_list = [
+
+    ## Weights trained by myself or others in PyTorch
+    _entry('resdet50', 'ResDet50', '1911.09070', batch_size=_bs(72),
+           model_desc='Trained in PyTorch with https://github.com/rwightman/efficientdet-pytorch'),
+    _entry('tf_efficientdet_lite0', 'EfficientDet-Lite0', '1911.09070', batch_size=_bs(128),
+           model_desc='Trained in PyTorch with https://github.com/rwightman/efficientdet-pytorch'),
+    _entry('efficientdet_d0', 'EfficientDet-D0', '1911.09070', batch_size=_bs(112),
+           model_desc='Trained in PyTorch with https://github.com/rwightman/efficientdet-pytorch'),
+    _entry('efficientdet_d1', 'EfficientDet-D1', '1911.09070', batch_size=_bs(72),
+           model_desc='Trained in PyTorch with https://github.com/rwightman/efficientdet-pytorch'),
+
     ## Weights ported by myself from other frameworks
     _entry('tf_efficientdet_d0', 'EfficientDet-D0', '1911.09070', batch_size=_bs(112),
            model_desc='Ported from official Google AI Tensorflow weights'),
@@ -59,10 +69,8 @@ def _entry(model_name, paper_model_name, paper_arxiv_id, batch_size=BATCH_SIZE,
            model_desc='Ported from official Google AI Tensorflow weights'),
     _entry('tf_efficientdet_d7', 'EfficientDet-D7', '1911.09070', batch_size=_bs(4),
            model_desc='Ported from official Google AI Tensorflow weights'),
-
-    ## Weights trained by myself in PyTorch
-    _entry('efficientdet_d0', 'EfficientDet-D0', '1911.09070', batch_size=_bs(112),
-           model_desc='Trained in PyTorch with https://github.com/rwightman/efficientdet-pytorch'),
+    # _entry('tf_efficientdet_d7x', 'EfficientDet-D7X', '1911.09070', batch_size=_bs(4),
+    #        model_desc='Ported from official Google AI Tensorflow weights'),
 ]
 
 
@@ -87,14 +95,13 @@ def eval_model(model_name, paper_model_name, paper_arxiv_id, batch_size=64, mode
     else:
         print('AMP not installed, running network in FP32.')
 
-    annotation_path = os.path.join(DATA_ROOT, 'annotations', f'instances_{ANNO_SET}.json')
     evaluator = COCOEvaluator(
         root=DATA_ROOT,
         model_name=paper_model_name,
         model_description=model_description,
         paper_arxiv_id=paper_arxiv_id)
 
-    dataset = DetectionDatset(os.path.join(DATA_ROOT, ANNO_SET), annotation_path)
+    dataset = create_dataset('coco', DATA_ROOT, splits='val')
 
     loader = create_loader(
         dataset,
@@ -106,16 +113,17 @@ def eval_model(model_name, paper_model_name, paper_arxiv_id, batch_size=64, mode
         pin_mem=True)
 
     iterator = tqdm.tqdm(loader, desc="Evaluation", mininterval=5)
+    sample_count = 0
     evaluator.reset_time()
-
     with torch.no_grad():
         for i, (input, target) in enumerate(iterator):
-            output = bench(input, target['img_scale'], target['img_size'])
+            output = bench(input, target)
             output = output.cpu()
-            sample_ids = target['img_id'].cpu()
             results = []
             for index, sample in enumerate(output):
-                image_id = int(sample_ids[index])
+                image_id = int(dataset.parser.img_ids[sample_count])
+                sample[:, 2] -= sample[:, 0]
+                sample[:, 3] -= sample[:, 1]
                 for det in sample:
                     score = float(det[4])
                     if score < .001:  # stop when below this threshold, scores in descending order
@@ -126,6 +134,7 @@ def eval_model(model_name, paper_model_name, paper_arxiv_id, batch_size=64, mode
                         score=score,
                         category_id=int(det[5]))
                     results.append(coco_det)
+                sample_count += 1
             evaluator.add(results)
 
             if evaluator.cache_exists:
diff --git a/train.py b/train.py
diff --git a/validate.py b/validate.py