wanghaoshuang
diff --git a/‎demo/nas/block_sa_nas_mobilenetv2.py
+5-9 b/‎demo/nas/block_sa_nas_mobilenetv2.py
+5-9
diff --git a/‎demo/nas/rl_nas_mobilenetv2.py
+5-8 b/‎demo/nas/rl_nas_mobilenetv2.py
+5-8
diff --git a/‎demo/nas/sa_nas_mobilenetv2.py
+10-16 b/‎demo/nas/sa_nas_mobilenetv2.py
+10-16
diff --git a/‎demo/nas/sanas_darts_space.py
+12-15 b/‎demo/nas/sanas_darts_space.py
+12-15
diff --git a/‎demo/prune/README.md
+13-3 b/‎demo/prune/README.md
+13-3
diff --git a/‎demo/prune/fpgm_mobilenetv1_f-50_train.sh
+2-1 b/‎demo/prune/fpgm_mobilenetv1_f-50_train.sh
+2-1
diff --git a/‎demo/prune/fpgm_mobilenetv2_f-50_train.sh
+2-1 b/‎demo/prune/fpgm_mobilenetv2_f-50_train.sh
+2-1
diff --git a/‎demo/prune/fpgm_resnet34_f-42_train.sh
+2-1 b/‎demo/prune/fpgm_resnet34_f-42_train.sh
+2-1
diff --git a/‎demo/prune/fpgm_resnet34_f-50_train.sh
+2-1 b/‎demo/prune/fpgm_resnet34_f-50_train.sh
+2-1
diff --git a/‎demo/prune/train.py
+22-16 b/‎demo/prune/train.py
+22-16
@@ -13,7 +13,6 @@
 from paddleslim.analysis import flops
 from paddleslim.nas import SANAS
 from paddleslim.common import get_logger
-from optimizer import create_optimizer
 import imagenet_reader
 
 _logger = get_logger(__name__, level=logging.INFO)
@@ -157,15 +156,13 @@ def search_mobilenetv2_block(config, args, image_size):
 
         build_strategy = static.BuildStrategy()
         train_compiled_program = static.CompiledProgram(
-            train_program).with_data_parallel(
-                loss_name=avg_cost.name, build_strategy=build_strategy)
+            train_program, build_strategy=build_strategy)
         for epoch_id in range(args.retain_epoch):
             for batch_id, data in enumerate(train_loader()):
                 fetches = [avg_cost.name]
                 s_time = time.time()
-                outs = exe.run(train_compiled_program,
-                               feed=data,
-                               fetch_list=fetches)[0]
+                outs = exe.run(
+                    train_compiled_program, feed=data, fetch_list=fetches)[0]
                 batch_time = time.time() - s_time
                 if batch_id % 10 == 0:
                     _logger.info(
@@ -175,9 +172,8 @@ def search_mobilenetv2_block(config, args, image_size):
         reward = []
         for batch_id, data in enumerate(val_loader()):
             test_fetches = [avg_cost.name, acc_top1.name, acc_top5.name]
-            batch_reward = exe.run(test_program,
-                                   feed=data,
-                                   fetch_list=test_fetches)
+            batch_reward = exe.run(
+                test_program, feed=data, fetch_list=test_fetches)
             reward_avg = np.mean(np.array(batch_reward), axis=1)
             reward.append(reward_avg)
 
 
@@ -141,15 +141,13 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
 
         build_strategy = static.BuildStrategy()
         train_compiled_program = static.CompiledProgram(
-            train_program).with_data_parallel(
-                loss_name=avg_cost.name, build_strategy=build_strategy)
+            train_program, build_strategy=build_strategy)
         for epoch_id in range(args.retain_epoch):
             for batch_id, data in enumerate(train_loader()):
                 fetches = [avg_cost.name]
                 s_time = time.time()
-                outs = exe.run(train_compiled_program,
-                               feed=data,
-                               fetch_list=fetches)[0]
+                outs = exe.run(
+                    train_compiled_program, feed=data, fetch_list=fetches)[0]
                 batch_time = time.time() - s_time
                 if batch_id % 10 == 0:
                     _logger.info(
@@ -161,9 +159,8 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
             test_fetches = [
                 test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
             ]
-            batch_reward = exe.run(test_program,
-                                   feed=data,
-                                   fetch_list=test_fetches)
+            batch_reward = exe.run(
+                test_program, feed=data, fetch_list=test_fetches)
             reward_avg = np.mean(np.array(batch_reward), axis=1)
             reward.append(reward_avg)
 
 
@@ -134,15 +134,13 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
 
         build_strategy = static.BuildStrategy()
         train_compiled_program = static.CompiledProgram(
-            train_program).with_data_parallel(
-                loss_name=avg_cost.name, build_strategy=build_strategy)
+            train_program, build_strategy=build_strategy)
         for epoch_id in range(args.retain_epoch):
             for batch_id, data in enumerate(train_loader()):
                 fetches = [avg_cost.name]
                 s_time = time.time()
-                outs = exe.run(train_compiled_program,
-                               feed=data,
-                               fetch_list=fetches)[0]
+                outs = exe.run(
+                    train_compiled_program, feed=data, fetch_list=fetches)[0]
                 batch_time = time.time() - s_time
                 if batch_id % 10 == 0:
                     _logger.info(
@@ -154,9 +152,8 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
             test_fetches = [
                 test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
             ]
-            batch_reward = exe.run(test_program,
-                                   feed=data,
-                                   fetch_list=test_fetches)
+            batch_reward = exe.run(
+                test_program, feed=data, fetch_list=test_fetches)
             reward_avg = np.mean(np.array(batch_reward), axis=1)
             reward.append(reward_avg)
 
@@ -223,15 +220,13 @@ def test_search_result(tokens, image_size, args, config):
 
     build_strategy = static.BuildStrategy()
     train_compiled_program = static.CompiledProgram(
-        train_program).with_data_parallel(
-            loss_name=avg_cost.name, build_strategy=build_strategy)
+        train_program, build_strategy=build_strategy)
     for epoch_id in range(args.retain_epoch):
         for batch_id, data in enumerate(train_loader()):
             fetches = [avg_cost.name]
             s_time = time.time()
-            outs = exe.run(train_compiled_program,
-                           feed=data,
-                           fetch_list=fetches)[0]
+            outs = exe.run(
+                train_compiled_program, feed=data, fetch_list=fetches)[0]
             batch_time = time.time() - s_time
             if batch_id % 10 == 0:
                 _logger.info(
@@ -243,9 +238,8 @@ def test_search_result(tokens, image_size, args, config):
             test_fetches = [
                 test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
             ]
-            batch_reward = exe.run(test_program,
-                                   feed=data,
-                                   fetch_list=test_fetches)
+            batch_reward = exe.run(
+                test_program, feed=data, fetch_list=test_fetches)
             reward_avg = np.mean(np.array(batch_reward), axis=1)
             reward.append(reward_avg)
 
 
@@ -119,8 +119,8 @@ def train(main_prog, exe, epoch_id, train_loader, fetch_list, args):
                     [[drop_path_probility * epoch_id / args.retain_epoch]
                      for i in range(args.batch_size)]).astype(np.float32)
                 drop_path_mask = 1 - np.random.binomial(
-                    1, drop_path_prob[0],
-                    size=[args.batch_size, 20, 4, 2]).astype(np.float32)
+                    1, drop_path_prob[0], size=[args.batch_size, 20, 4, 2
+                                                ]).astype(np.float32)
                 feed.append({
                     "image": image,
                     "label": label,
@@ -195,8 +195,8 @@ def search(config, args, image_size, is_server=True):
 
         current_params = count_parameters_in_MB(
             train_program.global_block().all_parameters(), 'cifar10')
-        _logger.info('step: {}, current_params: {}M'.format(step,
-                                                            current_params))
+        _logger.info(
+            'step: {}, current_params: {}M'.format(step, current_params))
         if current_params > float(3.77):
             continue
 
@@ -222,9 +222,7 @@ def search(config, args, image_size, is_server=True):
 
         build_strategy = static.BuildStrategy()
         train_compiled_program = static.CompiledProgram(
-            train_program).with_data_parallel(
-                loss_name=train_fetch_list[0].name,
-                build_strategy=build_strategy)
+            train_program, build_strategy=build_strategy)
 
         valid_top1_list = []
         for epoch_id in range(args.retain_epoch):
@@ -234,8 +232,8 @@ def search(config, args, image_size, is_server=True):
                 step, epoch_id, train_top1))
             valid_top1 = valid(test_program, exe, epoch_id, test_loader,
                                test_fetch_list, args)
-            _logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id,
-                                                                   valid_top1))
+            _logger.info(
+                "TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id, valid_top1))
             valid_top1_list.append(valid_top1)
         sa_nas.reward(float(valid_top1_list[-1] + valid_top1_list[-2]) / 2)
 
@@ -276,19 +274,18 @@ def final_test(config, args, image_size, token=None):
 
     build_strategy = static.BuildStrategy()
     train_compiled_program = static.CompiledProgram(
-        train_program).with_data_parallel(
-            loss_name=train_fetch_list[0].name, build_strategy=build_strategy)
+        train_program, build_strategy=build_strategy)
 
     valid_top1_list = []
     for epoch_id in range(args.retain_epoch):
         train_top1 = train(train_compiled_program, exe, epoch_id, train_loader,
                            train_fetch_list, args)
-        _logger.info("TRAIN: Epoch {}, train_acc {:.6f}".format(epoch_id,
-                                                                train_top1))
+        _logger.info(
+            "TRAIN: Epoch {}, train_acc {:.6f}".format(epoch_id, train_top1))
         valid_top1 = valid(test_program, exe, epoch_id, test_loader,
                            test_fetch_list, args)
-        _logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id,
-                                                               valid_top1))
+        _logger.info(
+            "TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id, valid_top1))
         valid_top1_list.append(valid_top1)
 
         output_dir = os.path.join('darts_output', str(epoch_id))
 
@@ -34,6 +34,7 @@ tar -xf MobileNetV1_pretrained.tar
 
 通过以下命令启动裁剪任务：
 
+- 单卡启动：
 ```
 export CUDA_VISIBLE_DEVICES=0
 python train.py \
@@ -43,9 +44,18 @@ python train.py \
 --criterion "l1_norm"
 ```
 
-其中，`model`用于指定待裁剪的模型。`pruned_ratio`用于指定各个卷积层通道数被裁剪的比例。`data`选项用于指定使用的数据集。
-`criterion` 选项用于指定所使用的剪裁算法策略，现在支持`l1_norm`, `bn_scale`, `geometry_median`。默认为`l1_norm`。可以
-设置该参数以改变剪裁算法策略。该目录下的四个shell脚本文件是在ResNet34, MobileNetV1, MobileNetV2等三个模型上进行的四组
+- 多卡启动：
+```
+export CUDA_VISIBLE_DEVICES=0, 1
+python -m paddle.distributed.launch train.py \
+--model "MobileNet" \
+--pruned_ratio 0.31 \
+--data "mnist"  \
+--criterion "l1_norm" \
+--fleet
+```
+
+其中，`model`用于指定待裁剪的模型。`pruned_ratio`用于指定各个卷积层通道数被裁剪的比例。`data`选项用于指定使用的数据集。`criterion` 选项用于指定所使用的剪裁算法策略，现在支持`l1_norm`, `bn_scale`, `geometry_median`，默认为`l1_norm`。`fleet` 用于开启多卡训练，在多卡启动时需要调用该参数。该目录下的四个shell脚本文件是在ResNet34, MobileNetV1, MobileNetV2等三个模型上进行的四组
 `criterion`设置为`geometry_median`的实验，可以直接运行脚本文件启动剪裁实验。
 
 执行`python train.py --help`查看更多选项。
 
@@ -1,7 +1,7 @@
 #!/bin/bash  
 export CUDA_VISIBLE_DEVICES=0,1
 export FLAGS_fraction_of_gpu_memory_to_use=0.98
-python train.py \
+python -m paddle.distributed.launch train.py \
     --model="MobileNet" \
     --pretrained_model="/workspace/models/MobileNetV1_pretrained" \
     --data="imagenet" \
@@ -14,4 +14,5 @@ python train.py \
     --lr_strategy="piecewise_decay" \
     --criterion="geometry_median" \
     --model_path="./fpgm_mobilenetv1_models" \
+    --fleet \
     2>&1 | tee fpgm_mobilenetv1_train.log
@@ -1,7 +1,7 @@
 #!/bin/bash  
 export CUDA_VISIBLE_DEVICES=0,1
 export FLAGS_fraction_of_gpu_memory_to_use=0.98
-python train.py \
+python -m paddle.distributed.launch train.py \
     --model="MobileNetV2" \
     --pretrained_model="/workspace/models/MobileNetV2_pretrained" \
     --data="imagenet" \
@@ -14,4 +14,5 @@ python train.py \
     --lr_strategy="piecewise_decay" \
     --criterion="geometry_median" \
     --model_path="./fpgm_mobilenetv2_models" \
+    --fleet \
     2>&1 | tee fpgm_mobilenetv2_train.log
@@ -1,12 +1,13 @@
 #!/bin/bash  
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 export FLAGS_fraction_of_gpu_memory_to_use=0.98
-python train.py \
+python -m paddle.distributed.launch train.py \
     --model="ResNet34" \
     --pretrained_model="/workspace/models/ResNet34_pretrained" \
     --data="imagenet" \
     --pruned_ratio=0.25 \
     --lr_strategy="cosine_decay" \
     --criterion="geometry_median" \
     --model_path="./fpgm_resnet34_025_120_models" \
+    --fleet \
     2>&1 | tee fpgm_resnet025_120_train.log
@@ -1,7 +1,7 @@
 #!/bin/bash  
 export CUDA_VISIBLE_DEVICES=0,1
 export FLAGS_fraction_of_gpu_memory_to_use=0.98
-python train.py \
+python -m paddle.distributed.launch train.py \
     --model="ResNet34" \
     --pretrained_model="/workspace/models/ResNet34_pretrained" \
     --data="imagenet" \
@@ -14,4 +14,5 @@ python train.py \
     --lr_strategy="piecewise_decay" \
     --criterion="geometry_median" \
     --model_path="./fpgm_resnet34_models" \
+    --fleet \
     2>&1 | tee fpgm_resnet03_train.log
@@ -15,6 +15,7 @@
 import models
 from utility import add_arguments, print_arguments
 import paddle.vision.transforms as T
+from paddle.distributed import fleet
 
 _logger = get_logger(__name__, level=logging.INFO)
 
@@ -40,6 +41,7 @@
 add_arg('criterion',        str, "l1_norm",         "The prune criterion to be used, support l1_norm and batch_norm_scale.")
 add_arg('save_inference',   bool, False,                "Whether to save inference model.")
 add_arg('ce_test',          bool, False, "Whether to CE test.")
+parser.add_argument('fleet',            action='store_true',  help="Whether to turn on distributed training.")
 # yapf: enable
 
 model_list = models.__all__
@@ -96,6 +98,8 @@ def create_optimizer(args, step_per_epoch):
 
 
 def compress(args):
+    if args.fleet:
+        fleet.init(is_collective=True)
 
     num_workers = 4
     shuffle = True
@@ -130,8 +134,8 @@ def compress(args):
     else:
         raise ValueError("{} is not supported.".format(args.data))
     image_shape = [int(m) for m in image_shape.split(",")]
-    assert args.model in model_list, "{} is not in lists: {}".format(args.model,
-                                                                     model_list)
+    assert args.model in model_list, "{} is not in lists: {}".format(
+        args.model, model_list)
     places = paddle.static.cuda_places(
     ) if args.use_gpu else paddle.static.cpu_places()
     place = places[0]
@@ -140,13 +144,16 @@ def compress(args):
         name='image', shape=[None] + image_shape, dtype='float32')
     label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
     batch_size_per_card = int(args.batch_size / len(places))
+    sampler = paddle.io.DistributedBatchSampler(
+        train_dataset,
+        shuffle=shuffle,
+        drop_last=True,
+        batch_size=batch_size_per_card)
     train_loader = paddle.io.DataLoader(
         train_dataset,
         places=places,
         feed_list=[image, label],
-        drop_last=True,
-        batch_size=batch_size_per_card,
-        shuffle=shuffle,
+        batch_sampler=sampler,
         return_list=False,
         use_shared_memory=True,
         num_workers=num_workers)
@@ -171,6 +178,8 @@ def compress(args):
     acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)
     val_program = paddle.static.default_main_program().clone(for_test=True)
     opt, learning_rate = create_optimizer(args, step_per_epoch)
+    if args.fleet:
+        opt = fleet.distributed_optimizer(opt)
     opt.minimize(avg_cost)
 
     exe.run(paddle.static.default_startup_program())
@@ -180,8 +189,8 @@ def compress(args):
         def if_exist(var):
             return os.path.exists(os.path.join(args.pretrained_model, var.name))
 
-        _logger.info("Load pretrained model from {}".format(
-            args.pretrained_model))
+        _logger.info(
+            "Load pretrained model from {}".format(args.pretrained_model))
         paddle.static.load(paddle.static.default_main_program(),
                            args.pretrained_model, exe)
 
@@ -247,13 +256,10 @@ def train(epoch, program):
         place=place)
     _logger.info("FLOPs after pruning: {}".format(flops(pruned_program)))
 
-    build_strategy = paddle.static.BuildStrategy()
-    exec_strategy = paddle.static.ExecutionStrategy()
-    train_program = paddle.static.CompiledProgram(
-        pruned_program).with_data_parallel(
-            loss_name=avg_cost.name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
+    if args.fleet:
+        train_program = paddle.static.CompiledProgram(pruned_program)
+    else:
+        train_program = pruned_program
 
     for i in range(args.num_epochs):
         train(i, train_program)
@@ -268,8 +274,8 @@ def train(epoch, program):
                 infer_model_path, [image], [out],
                 exe,
                 program=pruned_val_program)
-            _logger.info("Saved inference model into [{}]".format(
-                infer_model_path))
+            _logger.info(
+                "Saved inference model into [{}]".format(infer_model_path))
 
 
 def main():