17 | 17 | import math
18 | 18 |
19 | 19 | from .. import unique_name
| 20 | +from ..framework import Variable |
| 21 | +from ..data_feeder import check_type |
20 | 22 |
21 | 23 | __all__ = [
22 | 24 | 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
23 | | - 'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay' |
| 25 | + 'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay', 'LinearLrWarmup', |
| 26 | + 'ReduceLROnPlateau' |
24 | 27 | ]
25 | 28 |
26 | 29 |
@@ -619,7 +622,7 @@ class LinearLrWarmup(LearningRateDecay):
619 | 622 |
620 | 623 | learning_rate = 0.1
621 | 624 | warmup_steps = 50
622 | | - start_lr = 1. / 3. |
| 625 | + start_lr = 0 |
623 | 626 | end_lr = 0.1
624 | 627 |
625 | 628 | with fluid.dygraph.guard():
@@ -660,3 +663,193 @@ def step(self):
660 | 663 | return self.lr_ratio_before_warmup * self.step_num
661 | 664 | else:
662 | 665 | return base_lr
| 666 | + |
| 667 | + |
| 668 | +class ReduceLROnPlateau(LearningRateDecay): |
| 669 | + """ |
| 670 | + Reduce learning rate when ``loss`` has stopped descending. Models often benefit from reducing the learning rate |
| 671 | + by 2 to 10 times once model performance no longer improves. |
| 672 | + |
| 673 | + The ``loss`` is the one that has been passed into ``step`` ; it must be a 1-D Tensor with shape [1]. When ``loss`` |
| 674 | + stops descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * decay_rate`` . |
| 675 | + (In particular, ``mode`` can also be set to ``'max'`` ; in this case, when ``loss`` stops ascending for a ``patience`` number |
| 676 | + of epochs, the learning rate will be reduced.) |
| 677 | + |
| 678 | + In addition, after each reduction, it waits a ``cooldown`` number of epochs before resuming normal operation. |
| 679 | + |
| 680 | + Args: |
| 681 | + learning_rate (Variable|float|int): The initial learning rate. It can be set to python float or int number. |
| 682 | + If the type is Variable, it should be 1-D Tensor with shape [1], the data type can be 'float32' or 'float64'. |
| 683 | + mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the |
| 684 | + learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` , the learning |
| 685 | + rate will reduce when ``loss`` stops ascending. Default: ``'min'`` . |
| 686 | + decay_rate (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . |
| 687 | + It should be less than 1.0. Default: 0.1. |
| 688 | + patience (int, optional): When ``loss`` doesn't improve for this number of epochs, the learning rate will be reduced. |
| 689 | + Default: 10. |
| 690 | + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``. |
| 691 | + threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . |
| 692 | + Changes of ``loss`` smaller than this will be ignored. Default: 1e-4. |
| 693 | + threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss`` |
| 694 | + is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum |
| 695 | + change of ``loss`` is ``threshold`` . Default: ``'rel'`` . |
| 696 | + cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0. |
| 697 | + min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0. |
| 698 | + eps (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than eps, the update is |
| 699 | + ignored. Default: 1e-8. |
| 700 | + dtype (str, optional): The data type used to create the learning rate variable. The data type can be set as |
| 701 | + 'float32', 'float64'. Default: 'float32'. |
| 702 | + |
| 703 | + Returns: |
| 704 | + Reduced learning rate. |
| 705 | + |
| 706 | + Examples: |
| 707 | + |
| 708 | + .. code-block:: python |
| 709 | + |
| 710 | + import paddle.fluid as fluid |
| 711 | + import numpy as np |
| 712 | + |
| 713 | + with fluid.dygraph.guard(): |
| 714 | + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") |
| 715 | + linear = fluid.dygraph.Linear(10, 10) |
| 716 | + input = fluid.dygraph.to_variable(x) |
| 717 | + |
| 718 | + reduce_lr = fluid.dygraph.ReduceLROnPlateau( |
| 719 | + learning_rate = 1.0, |
| 720 | + decay_rate = 0.5, |
| 721 | + patience = 5, |
| 722 | + verbose = True, |
| 723 | + cooldown = 3) |
| 724 | + adam = fluid.optimizer.Adam( |
| 725 | + learning_rate = reduce_lr, |
| 726 | + parameter_list = linear.parameters()) |
| 727 | + |
| 728 | + for epoch in range(10): |
| 729 | + total_loss = 0 |
| 730 | + for batch_id in range(5): |
| 731 | + out = linear(input) |
| 732 | + loss = fluid.layers.reduce_mean(out) |
| 733 | + total_loss += loss |
| 734 | + adam.minimize(loss) |
| 735 | + |
| 736 | + avg_loss = total_loss/5 |
| 737 | + |
| 738 | + # adjust learning rate according to avg_loss |
| 739 | + reduce_lr.step(avg_loss) |
| 740 | + lr = adam.current_step_lr() |
| 741 | + print("current avg_loss is %s, current lr is %s" % (avg_loss.numpy()[0], lr)) |
| 742 | + |
| 743 | + """ |
| 744 | + |
| 745 | + def __init__(self, |
| 746 | + learning_rate, |
| 747 | + mode='min', |
| 748 | + decay_rate=0.1, |
| 749 | + patience=10, |
| 750 | + verbose=False, |
| 751 | + threshold=1e-4, |
| 752 | + threshold_mode='rel', |
| 753 | + cooldown=0, |
| 754 | + min_lr=0, |
| 755 | + eps=1e-8, |
| 756 | + dtype='float32'): |
| 757 | + super(ReduceLROnPlateau, self).__init__(dtype=dtype) |
| 758 | + mode = mode.lower() |
| 759 | + if mode not in ['min', 'max']: |
| 760 | + raise ValueError('mode ' + mode + ' is unknown!') |
| 761 | + self.mode = mode |
| 762 | + |
| 763 | + if decay_rate >= 1.0: |
| 764 | + raise ValueError( |
| 765 | + 'new_lr = origin_lr * decay_rate and decay_rate should be < 1.0.' |
| 766 | + ) |
| 767 | + self.decay_rate = decay_rate |
| 768 | + |
| 769 | + threshold_mode = threshold_mode.lower() |
| 770 | + if threshold_mode not in ['rel', 'abs']: |
| 771 | + raise ValueError('threshold mode ' + threshold_mode + |
| 772 | + ' is unknown!') |
| 773 | + self.threshold_mode = threshold_mode |
| 774 | + |
| 775 | + check_type(learning_rate, 'learning_rate', (float, int, Variable), |
| 776 | + 'ReduceLROnPlateau') |
| 777 | + if isinstance(learning_rate, (float, int)): |
| 778 | + learning_rate = self.create_lr_var(learning_rate) |
| 779 | + |
| 780 | + self.learning_rate = learning_rate |
| 781 | + self.verbose = verbose |
| 782 | + self.patience = patience |
| 783 | + self.threshold = threshold |
| 784 | + self.threshold_mode = threshold_mode |
| 785 | + self.cooldown = cooldown |
| 786 | + self.min_lr = self.create_lr_var(min_lr) |
| 787 | + self.eps = eps |
| 788 | + |
| 789 | + self.cooldown_counter = 0 |
| 790 | + self.best_loss = None |
| 791 | + self.num_bad_epochs = 0 |
| 792 | + self.epoch = 0 |
| 793 | + |
| 794 | + def __call__(self): |
| 795 | + return self.learning_rate |
| 796 | + |
| 797 | + def step(self, loss): |
| 798 | + """ |
| 799 | + It should be invoked on each epoch. Update the learning rate in optimizer according to ``loss`` . |
| 800 | + The new learning rate will take effect on the next call to ``optimizer.minimize`` . |
| 801 | + |
| 802 | + Args: |
| 803 | + loss (Variable): A ``Variable`` that will be monitored to determine whether the learning rate will be reduced. |
| 804 | + If it stops descending for a ``patience`` number of epochs, the learning rate will be reduced. It should |
| 805 | + be a 1-D Tensor with shape [1]. |
| 806 | + In particular, if ``mode`` has been set to ``'max'`` , the learning rate will be reduced when it stops ascending. |
| 807 | + Returns: |
| 808 | + None |
| 809 | + |
| 810 | + Examples: |
| 811 | + Please refer to the usage example in the class docstring above. |
| 812 | + """ |
| 813 | + |
| 814 | + # loss must be 1-D Tensor with shape [1] |
| 815 | + check_type(loss, 'loss', Variable, 'ReduceLROnPlateau.step') |
| 816 | + assert len(loss.shape) == 1 and loss.shape[0] == 1, "the loss.shape " \ |
| 817 | + "should be [1], but the current loss.shape is {}. You may need to " \ |
| 818 | + "call fluid.layers.mean to process it first.".format(loss.shape) |
| 819 | + |
| 820 | + self.epoch += 1 |
| 821 | + if self.cooldown_counter > 0: |
| 822 | + self.cooldown_counter -= 1 |
| 823 | + else: |
| 824 | + if self.best_loss is None or self._is_better(loss, self.best_loss): |
| 825 | + self.best_loss = loss |
| 826 | + self.num_bad_epochs = 0 |
| 827 | + else: |
| 828 | + self.num_bad_epochs += 1 |
| 829 | + |
| 830 | + if self.num_bad_epochs > self.patience: |
| 831 | + from .. import layers |
| 832 | + self.cooldown_counter = self.cooldown |
| 833 | + self.num_bad_epochs = 0 |
| 834 | + new_lr = layers.elementwise_max(self.learning_rate * |
| 835 | + self.decay_rate, self.min_lr) |
| 836 | + if self.learning_rate - new_lr > self.eps: |
| 837 | + if self.verbose: |
| 838 | + print('Epoch {}: reducing learning rate from {} to {}.'. |
| 839 | + format(self.epoch, |
| 840 | + self.learning_rate.numpy()[0], |
| 841 | + new_lr.numpy()[0])) |
| 842 | + self.learning_rate = new_lr |
| 843 | + |
| 844 | + def _is_better(self, current, best): |
| 845 | + if self.mode == 'min' and self.threshold_mode == 'rel': |
| 846 | + return current < best - best * self.threshold |
| 847 | + |
| 848 | + elif self.mode == 'min' and self.threshold_mode == 'abs': |
| 849 | + return current < best - self.threshold |
| 850 | + |
| 851 | + elif self.mode == 'max' and self.threshold_mode == 'rel': |
| 852 | + return current > best + best * self.threshold |
| 853 | + |
| 854 | + else: |
| 855 | + return current > best + self.threshold |
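
The ``'rel'`` vs ``'abs'`` ``threshold_mode`` comparison added in ``_is_better`` above can be checked with a small standalone sketch. This is pure Python, independent of Paddle; the loss values are made up for illustration only.

.. code-block:: python

    # Re-implements only the comparison from ReduceLROnPlateau._is_better
    # to show how `threshold` filters out tiny improvements (mode='min').
    def is_better(current, best, threshold=1e-4, threshold_mode='rel'):
        if threshold_mode == 'rel':
            # improvement must exceed a fraction of the previous best
            return current < best - best * threshold
        else:  # 'abs'
            # improvement must exceed a fixed amount
            return current < best - threshold

    best = 1.0
    print(is_better(0.99995, best))                        # False: 5e-5 improvement < 1.0 * 1e-4
    print(is_better(0.99980, best))                        # True:  2e-4 improvement > 1.0 * 1e-4
    print(is_better(0.99995, best, threshold_mode='abs'))  # False: 5e-5 improvement < 1e-4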
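Similarly, the ``patience`` / ``cooldown`` bookkeeping in ``step()`` can be traced with a minimal pure-Python simulation (again independent of Paddle, using plain floats instead of Variables; the loss sequence below is fabricated just to show when the reduction fires):

.. code-block:: python

    # Mirrors the counting logic of ReduceLROnPlateau.step() with
    # decay_rate=0.5, patience=2, cooldown=1 (threshold omitted for brevity).
    lr, decay_rate, patience, cooldown, min_lr = 1.0, 0.5, 2, 1, 0.0
    best, bad_epochs, cooldown_counter = None, 0, 0

    losses = [1.0, 0.9, 0.9, 0.9, 0.9, 0.8, 0.8, 0.8]
    for epoch, loss in enumerate(losses, 1):
        if cooldown_counter > 0:
            cooldown_counter -= 1          # during cooldown, losses are not counted
        else:
            if best is None or loss < best:
                best, bad_epochs = loss, 0
            else:
                bad_epochs += 1
            if bad_epochs > patience:      # plateau lasted longer than `patience`
                cooldown_counter, bad_epochs = cooldown, 0
                lr = max(lr * decay_rate, min_lr)
        print("epoch %d: loss=%.2f lr=%.2f" % (epoch, loss, lr))
    # lr drops from 1.0 to 0.5 at epoch 5, after the loss has plateaued
    # at 0.9 for more than `patience` epochs.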