Skip to content

Commit b192374

Browse files
authored
Update hyperparameters for DeepLab (#1926) (#1928)
* fix some hyperparameters * Update README
1 parent dc76c32 commit b192374

File tree

5 files changed

+33
-22
lines changed

5 files changed

+33
-22
lines changed

fluid/PaddleCV/deeplabv3+/.gitignore

+6-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1-
deeplabv3plus_xception65_initialize.params
2-
deeplabv3plus.params
3-
deeplabv3plus.tar.gz
1+
*.tgz
2+
deeplabv3plus_gn_init*
3+
deeplabv3plus_xception65_initialize*
4+
*.log
5+
*.sh
6+
output*

fluid/PaddleCV/deeplabv3+/README.md

+4-6
Original file line numberDiff line numberDiff line change
@@ -72,20 +72,19 @@ python train.py --help
7272
以上命令用于测试训练过程是否正常,仅仅迭代了50次并且使用了1的batch size,如果需要复现
7373
原论文的实验,请使用以下设置:
7474
```
75+
CUDA_VISIBLE_DEVICES=0 \
7576
python ./train.py \
76-
--batch_size=8 \
77+
--batch_size=4 \
7778
--parallel=True \
7879
--norm_type=gn \
7980
--train_crop_size=769 \
80-
--total_step=90000 \
81+
--total_step=500000 \
8182
--base_lr=0.001 \
8283
--init_weights_path=deeplabv3plus_gn_init \
8384
--save_weights_path=output \
8485
--dataset_path=$DATASET_PATH
8586
```
86-
如果您的显存不足,可以尝试减小`batch_size`,同时等比例放大`total_step`, 保证相乘的值不变,这得益于Group Norm的特性,改变 `batch_size` 并不会显著影响结果,而且能够节约更多显存, 比如您可以设置`--batch_size=4 --total_step=180000`
87-
88-
如果您希望使用多卡进行训练,可以同比增加`batch_size`,减小`total_step`, 比如原来单卡训练是`--batch_size=4 --total_step=180000`,使用4卡训练则是`--batch_size=16 --total_step=45000`
87+
如果您的显存不足,可以尝试减小`batch_size`,同时等比例放大`total_step`, 缩小`base_lr`, 保证相乘的值不变,这得益于Group Norm的特性,改变 `batch_size` 并不会显著影响结果,而且能够节约更多显存, 比如您可以设置`--batch_size=2 --total_step=1000000 --base_lr=0.0005`
8988

9089
### 测试
9190
执行以下命令在`Cityscape`测试数据集上进行测试:
@@ -110,7 +109,6 @@ step: 500, mIoU: 0.7881
110109

111110
|数据集 | norm type | pretrained model | trained model | mean IoU
112111
|---|---|---|---|---|
113-
|CityScape | batch norm | [deeplabv3plus_xception65_initialize.tgz](https://paddle-deeplab.bj.bcebos.com/deeplabv3plus_xception65_initialize.tgz) | [deeplabv3plus.tgz](https://paddle-deeplab.bj.bcebos.com/deeplabv3plus.tgz) | 0.7873 |
114112
|CityScape | group norm | [deeplabv3plus_gn_init.tgz](https://paddle-deeplab.bj.bcebos.com/deeplabv3plus_gn_init.tgz) | [deeplabv3plus_gn.tgz](https://paddle-deeplab.bj.bcebos.com/deeplabv3plus_gn.tgz) | 0.7881 |
115113

116114
## 参考

fluid/PaddleCV/deeplabv3+/eval.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,4 @@ def load_model():
137137
all_correct = right.copy()
138138
mp = (wrong + right) != 0
139139
miou2 = np.mean((right[mp] * 1.0 / (right[mp] + wrong[mp])))
140-
if args.verbose:
141-
print('step: %s, mIoU: %s' % (i + 1, miou2), flush=True)
142-
else:
143-
print('\rstep: %s, mIoU: %s' % (i + 1, miou2), end='\r', flush=True)
140+
print('step: %s, mIoU: %s' % (i + 1, miou2))

fluid/PaddleCV/deeplabv3+/reader.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
default_config = {
1010
"shuffle": True,
1111
"min_resize": 0.5,
12-
"max_resize": 2,
12+
"max_resize": 4,
1313
"crop_size": 769,
1414
}
1515

@@ -90,9 +90,21 @@ def get_img(self):
9090
break
9191
if shape == -1:
9292
return img, label, ln
93-
random_scale = np.random.rand(1) * (self.config['max_resize'] -
94-
self.config['min_resize']
95-
) + self.config['min_resize']
93+
94+
if np.random.rand() > 0.5:
95+
range_l = 1
96+
range_r = self.config['max_resize']
97+
else:
98+
range_l = self.config['min_resize']
99+
range_r = 1
100+
101+
if np.random.rand() > 0.5:
102+
assert len(img.shape) == 3 and len(
103+
label.shape) == 3, "{} {}".format(img.shape, label.shape)
104+
img = img[:, :, ::-1]
105+
label = label[:, :, ::-1]
106+
107+
random_scale = np.random.rand(1) * (range_r - range_l) + range_l
96108
crop_size = int(shape / random_scale)
97109
bb = crop_size // 2
98110

fluid/PaddleCV/deeplabv3+/train.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@
2121
add_arg = lambda *args: utility.add_arguments(*args, argparser=parser)
2222

2323
# yapf: disable
24-
add_arg('batch_size', int, 2, "The number of images in each batch during training.")
24+
add_arg('batch_size', int, 4, "The number of images in each batch during training.")
2525
add_arg('train_crop_size', int, 769, "Image crop size during training.")
26-
add_arg('base_lr', float, 0.0001, "The base learning rate for model training.")
27-
add_arg('total_step', int, 90000, "Number of the training step.")
26+
add_arg('base_lr', float, 0.001, "The base learning rate for model training.")
27+
add_arg('total_step', int, 500000, "Number of the training step.")
2828
add_arg('init_weights_path', str, None, "Path of the initial weights in paddlepaddle format.")
2929
add_arg('save_weights_path', str, None, "Path of the saved weights during training.")
3030
add_arg('dataset_path', str, None, "Cityscape dataset path.")
@@ -39,7 +39,7 @@
3939
parser.add_argument(
4040
'--enable_ce',
4141
action='store_true',
42-
help='If set, run the task with continuous evaluation logs.')
42+
help='If set, run the task with continuous evaluation logs. Users can ignore this agument.')
4343
#yapf: enable
4444

4545
@contextlib.contextmanager
@@ -87,7 +87,8 @@ def loss(logit, label):
8787
label = fluid.layers.reshape(label, [-1, 1])
8888
label = fluid.layers.cast(label, 'int64')
8989
label_nignore = fluid.layers.reshape(label_nignore, [-1, 1])
90-
loss = fluid.layers.softmax_with_cross_entropy(logit, label, ignore_index=255, numeric_stable_mode=True)
90+
logit = fluid.layers.softmax(logit, use_cudnn=False)
91+
loss = fluid.layers.cross_entropy(logit, label, ignore_index=255)
9192
label_nignore.stop_gradient = True
9293
label.stop_gradient = True
9394
return loss, label_nignore

0 commit comments

Comments
 (0)