
Commit 874f6b6

[Feature] Add ENCNet and SECrossEntropyLoss (PaddlePaddle#1648)
1 parent bfca53f commit 874f6b6

6 files changed · +318 -0 lines changed

configs/encnet/README.md

+12
@@ -0,0 +1,12 @@
# ENCNet: Context Encoding for Semantic Segmentation

## Reference

> Hang Zhang, Kristin Dana, et al. "Context Encoding for Semantic Segmentation". In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7151-7160. 2018.

## Performance

### Cityscapes

| Model | Backbone | Resolution | Training Iters | mIoU | mIoU (flip) | mIoU (ms+flip) | Links |
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|ENCNet|ResNet101_vd|1024x512|80000|79.42%|80.02%|-|[model](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/encnet_resnet101_os8_cityscapes_1024x512_80k/model.pdparams) \| [log](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/encnet_resnet101_os8_cityscapes_1024x512_80k/train.log) \| [vdl](https://www.paddlepaddle.org.cn/paddle/visualdl/service/app/index?id=c2b819e6b666e4e50bba4b525f515d41)|
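
For reference, a minimal inference sketch (not part of this commit): it builds the model registered in paddleseg/models/encnet.py below and loads the released weights with utils.load_entire_model, the same helper the model itself uses in init_weight. The ResNet101_vd import path and its keyword arguments are assumptions based on the YAML config in this commit.

# Minimal inference sketch (assumes paddleseg.models.backbones exposes
# ResNet101_vd and that it accepts output_stride, per the YAML config below).
import paddle
from paddleseg.models import ENCNet
from paddleseg.models.backbones import ResNet101_vd
from paddleseg.utils import utils

backbone = ResNet101_vd(output_stride=8)
model = ENCNet(num_classes=19,  # Cityscapes
               backbone=backbone,
               backbone_indices=[1, 2, 3],
               num_codes=32,
               mid_channels=512,
               use_se_loss=True,
               add_lateral=True)
# Load the weights linked in the table above.
utils.load_entire_model(
    model, 'https://bj.bcebos.com/paddleseg/dygraph/cityscapes/'
    'encnet_resnet101_os8_cityscapes_1024x512_80k/model.pdparams')
model.eval()
logits = model(paddle.rand([1, 3, 512, 1024]))[0]  # [1, 19, 512, 1024]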
configs/encnet/encnet_resnet101_os8_cityscapes_1024x512_80k.yml

+33

@@ -0,0 +1,33 @@
_base_: '../_base_/cityscapes.yml'

batch_size: 2
iters: 80000

model:
  type: ENCNet
  backbone:
    type: ResNet101_vd
    output_stride: 8
    pretrained: https://bj.bcebos.com/paddleseg/dygraph/resnet101_vd_ssld.tar.gz
  num_codes: 32
  mid_channels: 512
  backbone_indices: [1, 2, 3]
  use_se_loss: True
  add_lateral: True

optimizer:
  type: sgd
  weight_decay: 0.0005

loss:
  types:
    - type: CrossEntropyLoss
    - type: CrossEntropyLoss
    - type: SECrossEntropyLoss
  coef: [1, 0.4, 0.2]

lr_scheduler:
  type: PolynomialDecay
  learning_rate: 0.01
  end_lr: 0.0
  power: 0.9
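
The three entries under loss.types pair one-to-one with the outputs ENCNet.forward returns in training mode: the main head prediction, the auxiliary FCN head prediction, and the semantic-encoding logits (see paddleseg/models/encnet.py below). A sketch of the weighted sum this coef list expresses, assuming PaddleSeg's usual convention that the i-th loss is applied to the i-th model output:

# Sketch of the combined training objective implied by this config
# (assumption: loss_i pairs with output_i, as in PaddleSeg's training loop).
from paddleseg.models.losses import CrossEntropyLoss, SECrossEntropyLoss

loss_fns = [CrossEntropyLoss(), CrossEntropyLoss(), SECrossEntropyLoss()]
coef = [1, 0.4, 0.2]

def combined_loss(outputs, label):
    # outputs == [out, fcn_out, se_out] from ENCNet.forward in training mode.
    return sum(c * fn(o, label) for c, fn, o in zip(coef, loss_fns, outputs))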

paddleseg/models/__init__.py

+1
@@ -46,6 +46,7 @@
 from .ginet import GINet
 from .segmenter import *
 from .segnet import SegNet
+from .encnet import ENCNet
 from .hrnet_contrast import HRNetW48Contrast
 from .espnet import ESPNetV2
 from .dmnet import DMNet

paddleseg/models/encnet.py

+224
@@ -0,0 +1,224 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.cvlibs import manager
from paddleseg.models import layers
from paddleseg.utils import utils


@manager.MODELS.add_component
class ENCNet(nn.Layer):
    """
    The ENCNet implementation based on PaddlePaddle.

    The original article refers to
    Hang Zhang, Kristin Dana, et al. "Context Encoding for Semantic Segmentation".

    Args:
        num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): A backbone network.
        backbone_indices (tuple): The values in the tuple indicate the indices
            of the backbone outputs to use. Default: [1, 2, 3].
        num_codes (int): The number of codewords. Default: 32.
        mid_channels (int): The channels of the middle layers. Default: 512.
        use_se_loss (bool): Whether to use the semantic encoding loss. Default: True.
        add_lateral (bool): Whether to use lateral convolution layers. Default: False.
        pretrained (str, optional): The path or url of the pretrained model. Default: None.
    """

    def __init__(self,
                 num_classes,
                 backbone,
                 backbone_indices=[1, 2, 3],
                 num_codes=32,
                 mid_channels=512,
                 use_se_loss=True,
                 add_lateral=False,
                 pretrained=None):
        super().__init__()
        self.add_lateral = add_lateral
        self.num_codes = num_codes
        self.backbone = backbone
        self.backbone_indices = backbone_indices
        in_channels = [
            self.backbone.feat_channels[index] for index in backbone_indices
        ]

        self.bottleneck = layers.ConvBNReLU(
            in_channels[-1],
            mid_channels,
            3,
            padding=1,
        )
        if self.add_lateral:
            self.lateral_convs = nn.LayerList()
            for in_ch in in_channels[:-1]:
                self.lateral_convs.append(
                    layers.ConvBNReLU(
                        in_ch,
                        mid_channels,
                        1,
                    ))
            self.fusion = layers.ConvBNReLU(
                len(in_channels) * mid_channels,
                mid_channels,
                3,
                padding=1,
            )

        self.enc_module = EncModule(mid_channels, num_codes)
        self.head = nn.Conv2D(mid_channels, num_classes, 1)

        self.fcn_head = layers.AuxLayer(self.backbone.feat_channels[2],
                                        mid_channels, num_classes)

        self.use_se_loss = use_se_loss
        if use_se_loss:
            self.se_layer = nn.Linear(mid_channels, num_classes)

        self.pretrained = pretrained
        self.init_weight()

    def init_weight(self):
        if self.pretrained is not None:
            utils.load_entire_model(self, self.pretrained)

    def forward(self, inputs):
        N, C, H, W = paddle.shape(inputs)
        feats = self.backbone(inputs)
        fcn_feat = feats[2]

        feats = [feats[i] for i in self.backbone_indices]
        feat = self.bottleneck(feats[-1])

        if self.add_lateral:
            laterals = []
            for i, lateral_conv in enumerate(self.lateral_convs):
                laterals.append(
                    F.interpolate(lateral_conv(feats[i]),
                                  size=paddle.shape(feat)[2:],
                                  mode='bilinear',
                                  align_corners=False))
            feat = self.fusion(paddle.concat([feat, *laterals], 1))
        encode_feat, feat = self.enc_module(feat)
        out = self.head(feat)
        out = F.interpolate(out,
                            size=[H, W],
                            mode='bilinear',
                            align_corners=False)
        output = [out]
        if self.training:
            fcn_out = self.fcn_head(fcn_feat)
            fcn_out = F.interpolate(fcn_out,
                                    size=[H, W],
                                    mode='bilinear',
                                    align_corners=False)
            output.append(fcn_out)
            if self.use_se_loss:
                se_out = self.se_layer(encode_feat)
                output.append(se_out)
            # Training output order is [out, fcn_out, se_out]; it must match
            # the loss `types`/`coef` lists in the config.
            return output
        return output
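

# Encoding implements the context-encoding dictionary of Zhang et al.: it
# learns `num_codes` codewords d_k with per-codeword scales s_k (initialized
# from Uniform(-1, 0), i.e. negative). For each flattened pixel feature x_i
# it computes soft-assignment weights a_ik = softmax_k(s_k * ||x_i - d_k||^2)
# and aggregates residuals e_k = sum_i a_ik * (x_i - d_k), producing one
# [num_codes, channels] descriptor per image.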
class Encoding(nn.Layer):
    def __init__(self, channels, num_codes):
        super().__init__()
        self.channels, self.num_codes = channels, num_codes

        std = 1 / ((channels * num_codes)**0.5)
        self.codewords = self.create_parameter(
            shape=(num_codes, channels),
            default_initializer=nn.initializer.Uniform(-std, std),
        )
        self.scale = self.create_parameter(
            shape=(num_codes, ),
            default_initializer=nn.initializer.Uniform(-1, 0),
        )

    def scaled_l2(self, x, codewords, scale):
        num_codes, channels = paddle.shape(codewords)
        reshaped_scale = scale.reshape([1, 1, num_codes])
        expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1])
        reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])

        scaled_l2_norm = paddle.multiply(
            reshaped_scale,
            (expanded_x - reshaped_codewords).pow(2).sum(axis=3))
        return scaled_l2_norm

    def aggregate(self, assignment_weights, x, codewords):
        num_codes, channels = paddle.shape(codewords)
        reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
        expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1])

        encoded_feat = paddle.multiply(
            assignment_weights.unsqueeze(3),
            (expanded_x - reshaped_codewords)).sum(axis=1)
        encoded_feat = paddle.reshape(encoded_feat,
                                      [-1, self.num_codes, self.channels])
        return encoded_feat

    def forward(self, x):
        x_dims = x.ndim
        assert x_dims == 4, "The dimension of the input tensor must be 4, but got {}.".format(
            x_dims)
        assert paddle.shape(
            x
        )[1] == self.channels, "Encoding channels error, expected {} but got {}.".format(
            self.channels,
            paddle.shape(x)[1])
        batch_size = paddle.shape(x)[0]
        # Flatten the spatial dims: [N, C, H, W] -> [N, H*W, C].
        x = x.reshape([batch_size, self.channels, -1]).transpose([0, 2, 1])
        assignment_weights = F.softmax(self.scaled_l2(x, self.codewords,
                                                      self.scale),
                                       axis=2)
        encoded_feat = self.aggregate(assignment_weights, x, self.codewords)
        return encoded_feat


class EncModule(nn.Layer):
    def __init__(self, in_channels, num_codes):
        super().__init__()
        self.encoding_project = layers.ConvBNReLU(
            in_channels,
            in_channels,
            1,
        )
        self.encoding = nn.Sequential(
            Encoding(channels=in_channels, num_codes=num_codes),
            nn.BatchNorm1D(num_codes),
            nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(in_channels, in_channels),
            nn.Sigmoid(),
        )
        self.in_channels = in_channels

    def forward(self, x):
        encoding_projection = self.encoding_project(x)
        encoding_feat = self.encoding(encoding_projection)

        # Average over the codewords: [N, num_codes, C] -> [N, C].
        encoding_feat = encoding_feat.mean(axis=1)
        batch_size, _, _, _ = paddle.shape(x)

        gamma = self.fc(encoding_feat)
        y = gamma.reshape([batch_size, self.in_channels, 1, 1])
        # Channel-wise reweighting, as in SE blocks.
        output = F.relu(x + x * y)
        return encoding_feat, output
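
For intuition, a quick shape check of EncModule (a sketch with illustrative sizes, assuming the class is imported straight from the new module):

# Shape check for EncModule (illustrative sizes).
import paddle
from paddleseg.models.encnet import EncModule

m = EncModule(in_channels=512, num_codes=32)
x = paddle.rand([2, 512, 64, 128])
encoding_feat, out = m(x)
print(encoding_feat.shape)  # [2, 512] -> fed to the se_layer Linear head
print(out.shape)            # [2, 512, 64, 128] -> channel-reweighted features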

paddleseg/models/losses/__init__.py

+1
@@ -32,3 +32,4 @@
 from .detail_aggregate_loss import DetailAggregateLoss
 from .point_cross_entropy_loss import PointCrossEntropyLoss
 from .pixel_contrast_cross_entropy_loss import PixelContrastCrossEntropyLoss
+from .semantic_encode_cross_entropy_loss import SECrossEntropyLoss
paddleseg/models/losses/semantic_encode_cross_entropy_loss.py

+47

@@ -0,0 +1,47 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
class SECrossEntropyLoss(nn.Layer):
    """
    The Semantic Encoding Loss implementation based on PaddlePaddle.

    It supervises the semantic-encoding logits with a multi-hot vector that
    marks which classes appear in each label map, using binary cross-entropy.
    """

    def __init__(self, *args, **kwargs):
        super(SECrossEntropyLoss, self).__init__()

    def forward(self, logit, label):
        if logit.ndim == 4:
            # Squeeze both trailing singleton axes at once; squeezing axis 3
            # after axis 2 would index past the end of a 3-D tensor.
            logit = logit.squeeze([2, 3])
        assert logit.ndim == 2, \
            "The shape of logit should be [N, C, 1, 1] or [N, C], but the logit dim is {}.".format(
                logit.ndim)

        batch_size, num_classes = paddle.shape(logit)
        se_label = paddle.zeros([batch_size, num_classes])
        for i in range(batch_size):
            hist = paddle.histogram(label[i],
                                    bins=num_classes,
                                    min=0,
                                    max=num_classes - 1)
            hist = hist.astype('float32') / hist.sum().astype('float32')
            # Only presence matters: the target is 1 for every class that
            # occurs in the label map, 0 otherwise.
            se_label[i] = (hist > 0).astype('float32')
        loss = F.binary_cross_entropy_with_logits(logit, se_label)
        return loss
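
To make the target construction concrete, a toy example (illustrative numbers): a 4-class label map containing only classes 0 and 2 yields the multi-hot target [1, 0, 1, 0].

# Toy check of the SE target construction used above.
import paddle

label = paddle.to_tensor([[0, 2], [2, 0]], dtype='int64')
hist = paddle.histogram(label, bins=4, min=0, max=3)  # [2, 0, 2, 0]
print((hist > 0).astype('float32'))                   # [1., 0., 1., 0.]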
