"""This module defines the base Optimization Protocol classes. The classes defined herein are not
intended for direct use, but are rather parent classes to those defined in
:mod:`hyperparameter_hunter.optimization.backends.skopt.protocols`
Related
-------
:mod:`hyperparameter_hunter.optimization.backends.skopt.protocols`
Defines the optimization classes that are intended for direct use. All classes defined in
:mod:`hyperparameter_hunter.optimization.backends.skopt.protocols` should be descendants of
:class:`~hyperparameter_hunter.optimization.protocol_core.BaseOptPro`
:mod:`hyperparameter_hunter.result_reader`
Used to locate result files for Experiments that are similar to the current optimization
constraints, and produce data to learn from in the case of :class:`SKOptPro`
:mod:`hyperparameter_hunter.space`
Defines the child classes of `hyperparameter_hunter.space.Dimension`, which are used to define
the hyperparameters to optimize
:mod:`hyperparameter_hunter.utils.optimization_utils`
Provides utility functions for locating saved Experiments that fit within the constraints
currently being optimized"""
##################################################
# Import Own Assets
##################################################
from hyperparameter_hunter import __version__
from hyperparameter_hunter.algorithm_handlers import (
identify_algorithm,
identify_algorithm_hyperparameters,
)
from hyperparameter_hunter.experiments import CVExperiment
from hyperparameter_hunter.feature_engineering import FeatureEngineer
from hyperparameter_hunter.i_o.exceptions import (
EnvironmentInactiveError,
EnvironmentInvalidError,
RepeatedExperimentError,
DeprecatedWarning,
)
from hyperparameter_hunter.i_o.reporting import OptimizationReporter
from hyperparameter_hunter.i_o.result_reader import finder_selector
from hyperparameter_hunter.compat.keras_helper import reinitialize_callbacks
from hyperparameter_hunter.compat.keras_optimization_helper import (
keras_prep_workflow,
link_choice_ids,
)
from hyperparameter_hunter.metrics import get_formatted_target_metric
from hyperparameter_hunter.optimization.backends.skopt.engine import Optimizer, cook_estimator
from hyperparameter_hunter.settings import G, TEMP_MODULES_DIR_PATH
from hyperparameter_hunter.space.dimensions import RejectedOptional
from hyperparameter_hunter.space.space_core import Space
from hyperparameter_hunter.utils.boltons_utils import get_path
from hyperparameter_hunter.utils.general_utils import deep_restricted_update, subdict
from hyperparameter_hunter.utils.optimization_utils import get_choice_dimensions, dimension_subset
from hyperparameter_hunter.utils.version_utils import Deprecated
##################################################
# Import Miscellaneous Assets
##################################################
from abc import ABCMeta, abstractmethod
from datetime import datetime
from inspect import currentframe, getframeinfo
from os import walk, remove, rmdir
from os.path import abspath
from typing import Any, Dict
from warnings import warn
##################################################
# Import Learning Assets
##################################################
from skopt.callbacks import check_callback
# noinspection PyProtectedMember
from skopt.utils import eval_callbacks
try:
from tensorflow.keras import backend as K
except ImportError:
K = None
class OptProMeta(type):
"""Metaclass to accurately set :attr:`source_script` for its descendants even if the original
call was the product of scripts calling other scripts that eventually instantiated an
optimization protocol"""
@classmethod
def __prepare__(mcs, name, bases, **kwargs):
"""Prepare `namespace` to include :attr:`source_script`"""
namespace = dict(source_script=None)
return namespace
def __call__(cls, *args, **kwargs):
"""Set the instance's :attr:`source_script` to the absolute path of the file that
instantiated the OptimizationProtocol"""
setattr(cls, "source_script", abspath(getframeinfo(currentframe().f_back)[0]))
return super().__call__(*args, **kwargs)
class MergedOptProMeta(OptProMeta, ABCMeta):
"""Metaclass to combine :class:`OptProMeta`, and `ABCMeta`"""
pass
class BaseOptPro(metaclass=MergedOptProMeta):
source_script: str
def __init__(
self,
target_metric=None,
iterations=1,
verbose=1,
read_experiments=True,
reporter_parameters=None,
warn_on_re_ask=False,
):
"""Base class for intermediate base optimization protocol classes
There are two important methods for all :class:`BaseOptPro` descendants that should be
invoked after initialization:
1. :meth:`~hyperparameter_hunter.optimization.protocol_core.BaseOptPro.forge_experiment`
2. :meth:`~hyperparameter_hunter.optimization.protocol_core.BaseOptPro.go`
Parameters
----------
target_metric: Tuple, default=("oof", <:attr:`environment.Environment.metrics`[0]>)
Rarely necessary to explicitly provide this, as the default is usually sufficient. Path
denoting the metric to be used to compare Experiment performance. The first value
should be one of ["oof", "holdout", "in_fold"]. The second value should be the name of
a metric being recorded according to :attr:`environment.Environment.metrics_params`.
See the documentation for :func:`metrics.get_formatted_target_metric` for more info.
Any values returned by, or given as the `target_metric` input to,
:func:`~hyperparameter_hunter.metrics.get_formatted_target_metric` are acceptable
values for :attr:`BaseOptPro.target_metric`
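For example, assuming the active `Environment` records a metric named "roc_auc_score" (an
illustrative name, not one required by this module), either of the following would be a
valid, explicit `target_metric`::

    target_metric=("oof", "roc_auc_score")      # Compare Experiments by out-of-fold ROC-AUC
    target_metric=("holdout", "roc_auc_score")  # Compare Experiments by holdout ROC-AUC
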
iterations: Int, default=1
Number of Experiments to conduct during optimization upon invoking :meth:`BaseOptPro.go`
verbose: {0, 1, 2}, default=1
Verbosity mode for console logging. 0: Silent. 1: Show only logs from the Optimization
Protocol. 2: In addition to logs shown when verbose=1, also show the logs from
individual Experiments
read_experiments: Boolean, default=True
If True, all Experiment records that fit in the current :attr:`space` and guidelines,
and match :attr:`algorithm_name`, will be read in and used to fit any optimizers
reporter_parameters: Dict, or None, default=None
Additional parameters passed to :meth:`reporting.OptimizationReporter.__init__`. Note:
Unless provided explicitly, the key "do_maximize" will be added by default to
`reporter_params`, with a value inferred from the `direction` of :attr:`target_metric`
in `G.Env.metrics`. In nearly all cases, the "do_maximize" key should be ignored,
as there are very few reasons to explicitly include it
warn_on_re_ask: Boolean, default=False
If True, and the internal `optimizer` recommends a point that has already been evaluated
on invocation of `ask`, a warning is logged before recommending a random point. Either
way, a random point is used instead of already-evaluated recommendations. However,
logging the fact that this has taken place can be useful to indicate that the optimizer
may be stalling, especially if it repeatedly recommends the same point. In these cases,
if the suggested point is not optimal, it can be helpful to switch to a different OptPro
(especially `DummyOptPro`), which will suggest points using different criteria
Methods
-------
forge_experiment
Define constraints on Experiments conducted by OptPro (like hyperparameter search space)
go
Start optimization
Notes
-----
By default, 'script_backup' for Experiments is blacklisted when executed within
:class:`BaseOptPro` since it would just repeatedly create copies of the same, unchanged
file. So don't expect any script_backup files for Experiments executed by OptPros"""
#################### Optimization Protocol Parameters ####################
self.target_metric = target_metric
self.iterations = iterations
self.verbose = verbose
self.read_experiments = read_experiments
self.reporter_parameters = reporter_parameters or {}
self.warn_on_re_ask = warn_on_re_ask
#################### Experiment Guidelines ####################
self.model_initializer = None
self.model_init_params = None
self.model_extra_params = None
self.feature_engineer = None
self.feature_selector = None
self.notes = None
self.do_raise_repeated = True
#################### Search Parameters ####################
self.dimensions = []
self.search_bounds = dict()
self.space = None
self.similar_experiments = []
self.best_experiment = None
self.best_score = None
self.successful_iterations = 0
self.skipped_iterations = 0
self.tested_keys = []
self._search_space_size = None
#################### Incumbent Hyperparameters ####################
self.current_init_params = None
self.current_extra_params = None
self.current_feature_engineer = None
#################### Identification Attributes ####################
self.algorithm_name = None
self.module_name = None
self.current_experiment = None
self.current_score = None
#################### Keras-Specific Attributes ####################
self.dummy_layers = []
self.dummy_compile_params = dict()
#################### Secret Sauce ####################
self.init_iter_attrs = []
self.extra_iter_attrs = []
self.fe_iter_attrs = [lambda p, k, v: isinstance(v, FeatureEngineer)]
self.logger = None
self._preparation_workflow()
self.do_maximize = G.Env.metrics[self.target_metric[-1]].direction == "max"
##################################################
# Core Methods:
##################################################
def forge_experiment(
self,
model_initializer,
model_init_params=None,
model_extra_params=None,
feature_engineer=None,
feature_selector=None,
notes=None,
do_raise_repeated=True,
):
"""Define hyperparameter search scaffold for building Experiments during optimization
OptPros use this method to guide Experiment construction behind the scenes, which is why it
looks just like :meth:`hyperparameter_hunter.experiments.BaseExperiment.__init__`.
`forge_experiment` offers one major upgrade to standard Experiment initialization: it
accepts hyperparameters not only as concrete values, but also as space choices -- using
:class:`~hyperparameter_hunter.space.dimensions.Real`,
:class:`~hyperparameter_hunter.space.dimensions.Integer`, and
:class:`~hyperparameter_hunter.space.dimensions.Categorical`. This functionality applies to
the `model_init_params`, `model_extra_params` and `feature_engineer` kwargs. Any Dimensions
provided to `forge_experiment` are detected by the OptPro and used to define the
hyperparameter search space to be optimized
Parameters
----------
model_initializer: Class, or functools.partial, or class instance
Algorithm class used to initialize a model, such as XGBoost's `XGBRegressor`, or
SKLearn's `KNeighborsClassifier`; although, there are hundreds of possibilities across
many different ML libraries. `model_initializer` is expected to define at least `fit`
and `predict` methods. `model_initializer` will be initialized with `model_init_params`,
and its extra methods (`fit`, `predict`, etc.) will be invoked with parameters in
`model_extra_params`
model_init_params: Dict, or object (optional)
Dictionary of arguments given to create an instance of `model_initializer`. Any kwargs
that are considered valid by the `__init__` method of `model_initializer` are valid in
`model_init_params`.
In addition to providing concrete values, hyperparameters can be expressed as choices
(dimensions to optimize) by using instances of
:class:`~hyperparameter_hunter.space.dimensions.Real`,
:class:`~hyperparameter_hunter.space.dimensions.Integer`, or
:class:`~hyperparameter_hunter.space.dimensions.Categorical`. Furthermore,
hyperparameter choices and concrete values can be used together in `model_init_params`.
Using XGBoost's `XGBClassifier` to illustrate, the `model_init_params` kwarg of
:class:`~hyperparameter_hunter.experiments.CVExperiment` is limited to using concrete
values, such as ``dict(max_depth=10, learning_rate=0.1, booster="gbtree")``. This is
still valid for :meth:`.forge_experiment`. However, :meth:`.forge_experiment` also
allows `model_init_params` to consist entirely of space choices, such as
``dict(max_depth=Integer(2, 20), learning_rate=Real(0.001, 0.5),
booster=Categorical(["gbtree", "dart"]))``, or as any combination of concrete values
and choices, for instance, ``dict(max_depth=10, learning_rate=Real(0.001, 0.5),
booster="gbtree")``.
One of the key features that makes HyperparameterHunter so magical is that **ALL**
hyperparameters in the signature of `model_initializer` (and their default values) are
discovered -- whether or not they are explicitly given in `model_init_params`. Not only
does this make Experiment result descriptions incredibly thorough, it also makes
optimization smoother, more effective, and far less work for the user. For example, take
LightGBM's `LGBMRegressor`, with `model_init_params`=`dict(learning_rate=0.2)`.
HyperparameterHunter recognizes that this differs from the default of 0.1. It also
recognizes that `LGBMRegressor` is actually initialized with more than a dozen other
hyperparameters we didn't bother mentioning, and it records their values, too. So if we
want to optimize `num_leaves` tomorrow, the OptPro doesn't start from scratch. It knows
that we ran an Experiment that didn't explicitly mention `num_leaves`, but its default
value was 31, and it uses this information to fuel optimization -- all without us having
to manually keep track of tons of janky collections of hyperparameters. In fact, we
really don't need to go out of our way at all. HyperparameterHunter just acts as our
faithful lab assistant, keeping track of all the stuff we'd rather not worry about
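Pulling this together, a hedged sketch only (assuming an active `Environment`, a concrete
descendant such as
:class:`~hyperparameter_hunter.optimization.backends.skopt.protocols.BayesianOptPro`, and
XGBoost's `XGBClassifier`) of `model_init_params` mixing concrete values and space choices::

    opt = BayesianOptPro(iterations=10)
    opt.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),        # Space choice
            learning_rate=Real(0.001, 0.5),  # Space choice
            booster="gbtree",                # Concrete value
        ),
    )
    opt.go()
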
model_extra_params: Dict (optional)
Dictionary of extra parameters for models' non-initialization methods (like `fit`,
`predict`, `predict_proba`, etc.), and for neural networks. To specify parameters for
an extra method, place them in a dict named for the extra method to which the
parameters should be given. For example, to call `fit` with `early_stopping_rounds`=5,
use `model_extra_params`=`dict(fit=dict(early_stopping_rounds=5))`.
Declaring hyperparameter space choices works identically to `model_init_params`, meaning
that in addition to concrete values, extra parameters can be given as instances of
:class:`~hyperparameter_hunter.space.dimensions.Real`,
:class:`~hyperparameter_hunter.space.dimensions.Integer`, or
:class:`~hyperparameter_hunter.space.dimensions.Categorical`. To optimize over a space
in which `early_stopping_rounds` is between 3 and 9, use
`model_extra_params`=`dict(fit=dict(early_stopping_rounds=Integer(3, 9)))`.
For models whose `fit` methods have a kwarg like `eval_set` (such as XGBoost's), one can
use the `DatasetSentinel` attributes of the current active
:class:`~hyperparameter_hunter.environment.Environment`, documented under its
"Attributes" section and under
:attr:`~hyperparameter_hunter.environment.Environment.train_input`. An example using
several DatasetSentinels can be found in HyperparameterHunter's
[XGBoost Classification Example](https://github.com/HunterMcGushion/hyperparameter_hunter/blob/master/examples/xgboost_examples/classification.py)
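As a rough sketch (assuming `opt` is an OptPro being configured via :meth:`forge_experiment`,
`env` is the active :class:`~hyperparameter_hunter.environment.Environment`, and XGBoost's
`XGBClassifier` is the `model_initializer`), extra-method parameters and DatasetSentinels can
be combined like so::

    opt.forge_experiment(
        model_initializer=XGBClassifier,
        model_extra_params=dict(
            fit=dict(
                eval_set=[(env.validation_input, env.validation_target)],
                early_stopping_rounds=Integer(3, 9),  # Space choice for an extra-method kwarg
            )
        ),
    )
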
feature_engineer: `FeatureEngineer`, or list (optional)
Feature engineering/transformation/pre-processing steps to apply to datasets defined in
:class:`~hyperparameter_hunter.environment.Environment`. If list, will be used to
initialize :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer`, and can
contain any of the following values:
1. :class:`~hyperparameter_hunter.feature_engineering.EngineerStep` instance
2. Function input to :class:`~hyperparameter_hunter.feature_engineering.EngineerStep`
3. :class:`~hyperparameter_hunter.space.dimensions.Categorical`, with `categories`
comprising a selection of the previous two values (optimization only)
For important information on properly formatting `EngineerStep` functions, please see
the documentation of :class:`~hyperparameter_hunter.feature_engineering.EngineerStep`.
To search a space optionally including an `EngineerStep`, use the `optional` kwarg of
:class:`~hyperparameter_hunter.space.dimensions.Categorical`. This functionality is
illustrated in :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer`. If
using a `FeatureEngineer` instance to optimize `feature_engineer`, this instance cannot
be used with `CVExperiment` because Experiments can't handle space choices
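As an illustrative sketch (where `standard_scale`, `min_max_scale`, and `quantile_transform`
stand in for user-defined `EngineerStep` functions -- they are not provided by this module), a
`feature_engineer` search might be declared as::

    opt.forge_experiment(
        model_initializer=XGBClassifier,
        feature_engineer=[
            Categorical([standard_scale, min_max_scale], optional=True),  # Optional step choice
            quantile_transform,  # Concrete step applied in every Experiment
        ],
    )
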
feature_selector: List of str, callable, or list of booleans (optional)
Column names to include as input data for all provided DataFrames. If None,
`feature_selector` is set to all columns in :attr:`train_dataset`, less
:attr:`target_column`, and :attr:`id_column`. `feature_selector` is provided as the
second argument for calls to `pandas.DataFrame.loc` when constructing datasets
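For example, with hypothetical column names, ``feature_selector=["pclass", "fare", "age"]``
would restrict all input DataFrames to just those three columns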
notes: String (optional)
Additional information about the Experiment that will be saved with the Experiment's
description result file. This serves no purpose other than to facilitate saving
Experiment details in a more readable format
do_raise_repeated: Boolean, default=True
If True and this Experiment locates a previous Experiment's results with matching
Environment and Hyperparameter Keys, a RepeatedExperimentError will be raised. Else, a
warning will be logged
Notes
-----
The `auto_start` kwarg is not available here because :meth:`._execute_experiment` sets it
to False in order to check for duplicated keys before running the whole Experiment. This
and `target_metric` being moved to :meth:`.__init__` are the most notable differences
between calling :meth:`forge_experiment` and instantiating
:class:`~hyperparameter_hunter.experiments.CVExperiment`
A more accurate name for this method might be something like "build_experiment_forge", since
`forge_experiment` itself does not actually execute any Experiments. However,
`forge_experiment` sounds cooler and much less clunky
See Also
--------
:class:`hyperparameter_hunter.experiments.BaseExperiment`
One-off experimentation counterpart to an OptPro's :meth:`.forge_experiment`.
Internally, OptPros feed the processed arguments from `forge_experiment` to initialize
Experiments. This hand-off to Experiments takes place in :meth:`._execute_experiment`
"""
self.model_initializer = model_initializer
self.model_init_params = identify_algorithm_hyperparameters(self.model_initializer)
model_init_params = model_init_params if model_init_params is not None else {}
try:
self.model_init_params.update(model_init_params)
except TypeError:
self.model_init_params.update(dict(build_fn=model_init_params))
self.model_extra_params = model_extra_params if model_extra_params is not None else {}
self.feature_engineer = feature_engineer
if not isinstance(self.feature_engineer, FeatureEngineer):
self.feature_engineer = FeatureEngineer(self.feature_engineer)
self.feature_selector = feature_selector if feature_selector is not None else []
self.notes = notes
self.do_raise_repeated = do_raise_repeated
if self.do_raise_repeated is False:
G.warn_("WARNING: Setting `do_raise_repeated`=False allows duplicated Experiments")
self.algorithm_name, self.module_name = identify_algorithm(self.model_initializer)
self._validate_guidelines()
#################### Deal with Keras ####################
if self.module_name == "keras":
reusable_build_fn, reusable_wrapper_params, dummy_layers, dummy_compile_params = keras_prep_workflow(
self.model_initializer,
self.model_init_params["build_fn"],
self.model_extra_params,
self.source_script,
)
self.model_init_params = dict(build_fn=reusable_build_fn)
self.model_extra_params = reusable_wrapper_params
self.dummy_layers = dummy_layers
self.dummy_compile_params = dummy_compile_params
# FLAG: Deal with capitalization conflicts when comparing similar experiments: `optimizer`='Adam' vs 'adam'
self.set_dimensions()
@Deprecated(
v_deprecate="3.0.0a2",
v_remove="3.2.0",
v_current=__version__,
details="Renamed to `forge_experiment`",
)
def set_experiment_guidelines(self, *args, **kwargs):
self.forge_experiment(*args, **kwargs)
def set_dimensions(self):
"""Locate given hyperparameters that are `space` choice declarations and add them to
:attr:`dimensions`"""
all_dimension_choices = []
#################### Remap Extra Objects ####################
if self.module_name == "keras":
from tensorflow.keras.initializers import Initializer as KerasInitializer
from tensorflow.keras.callbacks import Callback as KerasCB
self.init_iter_attrs.append(lambda _p, _k, _v: isinstance(_v, KerasInitializer))
self.extra_iter_attrs.append(lambda _p, _k, _v: isinstance(_v, KerasCB))
#################### Collect Choice Dimensions ####################
init_dim_choices = get_choice_dimensions(self.model_init_params, self.init_iter_attrs)
extra_dim_choices = get_choice_dimensions(self.model_extra_params, self.extra_iter_attrs)
fe_dim_choices = get_choice_dimensions(self.feature_engineer, self.fe_iter_attrs)
for (path, choice) in init_dim_choices:
choice._name = ("model_init_params",) + path
all_dimension_choices.append(choice)
for (path, choice) in extra_dim_choices:
choice._name = ("model_extra_params",) + path
all_dimension_choices.append(choice)
for (path, choice) in fe_dim_choices:
choice._name = ("feature_engineer",) + path
all_dimension_choices.append(choice)
self.dimensions = all_dimension_choices
if self.module_name == "keras":
self.model_extra_params = link_choice_ids(
self.dummy_layers,
self.dummy_compile_params,
self.model_extra_params,
self.dimensions,
)
def get_ready(self):
"""Prepare for optimization by finalizing hyperparameter space and identifying similar
Experiments. This method is automatically invoked when :meth:`go` is called if necessary"""
if self.model_initializer is None:
raise ValueError("Must invoke `forge_experiment` before starting optimization")
_reporter_params = dict(dict(do_maximize=self.do_maximize), **self.reporter_parameters)
self.logger = OptimizationReporter(self.dimensions, **_reporter_params)
self.tested_keys = []
self._set_hyperparameter_space()
self._find_similar_experiments()
def go(self, force_ready=True):
"""Execute hyperparameter optimization, building an Experiment for each iteration
This method may only be invoked after invoking :meth:`.forge_experiment`, which defines
experiment guidelines and search dimensions. `go` performs a few important tasks: 1)
Formally setting the hyperparameter space; 2) Locating similar experiments to be used as
learning material (for OptPros that suggest incumbent search points by estimating utilities
using surrogate models); and 3) Actually setting off the optimization process, via
:meth:`._optimization_loop`
Parameters
----------
force_ready: Boolean, default=True
If True, :meth:`get_ready` will be invoked even if it has already been called, re-initializing
the hyperparameter `space` and `similar_experiments`. Since :meth:`go` invokes :meth:`get_ready`
on its own, `force_ready` only makes a difference when :meth:`get_ready` has already been
invoked manually"""
if force_ready or self.space is None:
self.get_ready()
loop_start_time = datetime.now()
self._optimization_loop()
loop_end_time = datetime.now()
G.log_(f"Optimization loop completed in {loop_end_time - loop_start_time}")
G.log_(f'Best score was {self.best_score} from Experiment "{self.best_experiment}"')
self._clean_up_optimization()
##################################################
# Helper Methods:
##################################################
def _optimization_loop(self, iteration=0):
"""Perform Experiment execution loop while `iteration` < `iterations`. At each iteration, an
Experiment will be executed, its results will be logged, and it will be compared to the
current best experiment
Parameters
----------
iteration: Int, default=0
The current iteration in the optimization loop"""
self.logger.print_optimization_header()
while iteration < self.iterations:
try:
self._execute_experiment()
except RepeatedExperimentError:
# G.debug_(F'Skipping repeated Experiment: {_ex!s}\n')
if len(self.similar_experiments) + len(self.tested_keys) >= self.search_space_size:
G.log_(f"Hyperparameter search space has been exhausted")
break
self.skipped_iterations += 1
continue
# NOTE: If reimplementing grid search, like `UninformedOptimizationProtocol`, add
# `except StopIteration` and see this commit, and 9b7ca73 / e2c3b73 (October 25, 2018)
self.logger.print_result(
self.current_hyperparameters_list,
self.current_score,
experiment_id=self.current_experiment.experiment_id,
)
#################### Update Best Experiment ####################
if (
(self.best_experiment is None) # First evaluation
or (self.do_maximize and (self.best_score < self.current_score)) # New best max
or (not self.do_maximize and (self.best_score > self.current_score)) # New best min
):
self.best_experiment = self.current_experiment.experiment_id
self.best_score = self.current_score
iteration += 1
def _execute_experiment(self):
"""Instantiate and run a :class:`experiments.CVExperiment` after checking for duplicate keys
Notes
-----
As described in the Notes of :meth:`BaseOptPro.forge_experiment`, the
`auto_start` kwarg of :meth:`experiments.CVExperiment.__init__` is set to False in order to
check for duplicated keys"""
self._update_current_hyperparameters()
#################### Initialize Experiment (Without Running) ####################
self.current_experiment = CVExperiment(
model_initializer=self.model_initializer,
model_init_params=self.current_init_params,
model_extra_params=self.current_extra_params,
feature_engineer=self.current_feature_engineer,
feature_selector=self.feature_selector, # TODO: Add `current_feature_selector`
notes=self.notes,
do_raise_repeated=self.do_raise_repeated,
auto_start=False,
)
# Fix `current_experiment.source_script` - Probably saying "protocol_core", which is a lie
self.current_experiment.source_script = self.source_script
#################### Run Experiment ####################
self.current_experiment.preparation_workflow()
self.current_experiment.experiment_workflow()
# If above raised `RepeatedExperimentError`, it is caught by :meth:`_optimization_loop`,
# stopping this method before it can incorrectly update `tested_keys` below
# Future Hunter, if multi-cross_experiment_keys ever supported, this will be a problem. Should've fixed it earlier, dummy
if self.current_experiment.hyperparameter_key.key not in self.tested_keys:
self.tested_keys.append(self.current_experiment.hyperparameter_key.key)
self.current_score = get_path(
self.current_experiment.last_evaluation_results, self.target_metric
)
self.successful_iterations += 1
self._clean_up_experiment()
@staticmethod
def _clean_up_optimization():
"""Perform any cleanup necessary after completion of the optimization loop. Most notably,
this handles removal of temporary model files created for Keras optimization"""
for (root, dirs, files) in walk(TEMP_MODULES_DIR_PATH, topdown=False):
for file in files:
if file.startswith("__temp_"):
remove(f"{root}/{file}")
try:
rmdir(root)
except OSError:
G.warn_(f"Unidentified file found in temporary directory: {root}")
def _clean_up_experiment(self):
"""Perform any cleanup necessary after completion of an Experiment"""
if self.module_name == "keras":
K.clear_session()
@staticmethod
def _select_params(param_prefix: str, current_params: Dict[tuple, Any]):
"""Retrieve sub-dict of `current_params` whose keys start with `param_prefix`
Parameters
----------
param_prefix: {"model_init_params", "model_extra_params", "feature_engineer"}
Target to filter keys of `current_params`. Only key/value pairs in `current_params`
whose keys start with `param_prefix` are returned. `param_prefix` is dropped from the
keys in the returned dict
current_params: Dict[tuple, Any]
Dict from which to return the subset whose keys start with `param_prefix`
Returns
-------
Dict[tuple, Any]
Contents of `current_params`, whose keys started with `param_prefix` - with
`param_prefix` dropped from the resulting keys"""
return {k[1:]: v for k, v in current_params if k[0] == param_prefix}
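# Illustrative sketch (comment only, not executed): given `current_params` items such as
#     {("model_init_params", "max_depth"): 12, ("model_extra_params", "fit", "verbose"): False}.items()
# `_select_params("model_init_params", ...)` would return {("max_depth",): 12} -- only the pairs
# whose key tuples begin with the prefix are kept, and the prefix is stripped from each key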
def _update_current_hyperparameters(self):
"""Update :attr:`current_init_params`, and :attr:`current_extra_params` according to the
upcoming set of hyperparameters to be searched"""
current_hyperparameters = self._get_current_hyperparameters().items()
init_params = self._select_params("model_init_params", current_hyperparameters)
extra_params = self._select_params("model_extra_params", current_hyperparameters)
fe_params = self._select_params("feature_engineer", current_hyperparameters)
# TODO: Add `fs_params` for `current_feature_selector`
# FLAG: At this point, `dummy_layers` shows "kernel_initializer" as `orthogonal` instance with "__hh" attrs
# FLAG: HOWEVER, the `orthogonal` instance does have `gain` set to the correct dummy value, ...
# FLAG: ... so it might be ok, as long as experiment matching can still work with that
self.current_init_params = deep_restricted_update(
self.model_init_params, init_params, iter_attrs=self.init_iter_attrs
)
self.current_extra_params = deep_restricted_update(
self.model_extra_params, extra_params, iter_attrs=self.extra_iter_attrs
)
self.current_feature_engineer = deep_restricted_update(
self.feature_engineer, fe_params, iter_attrs=self.fe_iter_attrs
)
# TODO: Add `current_feature_selector`
#################### Initialize `current_feature_engineer` ####################
current_fe = subdict(self.current_feature_engineer, keep=["steps", "do_validate"])
current_fe["steps"] = [_ for _ in current_fe["steps"] if _ != RejectedOptional()]
self.current_feature_engineer = FeatureEngineer(**current_fe)
#################### Deal with Keras ####################
if (self.module_name == "keras") and ("callbacks" in self.current_extra_params):
self.current_extra_params["callbacks"] = reinitialize_callbacks(
self.current_extra_params["callbacks"]
)
# No need to reinitialize Keras `initializers` - Their values are passed to `build_fn` via extra `params`
##################################################
# Abstract Methods:
##################################################
@abstractmethod
def _set_hyperparameter_space(self):
"""Initialize :attr:`space` according to the provided hyperparameter search dimensions"""
@abstractmethod
def _get_current_hyperparameters(self):
"""Retrieve the upcoming set of hyperparameters to be searched
Returns
-------
current_hyperparameters: Dict
The next set of hyperparameters that will be searched"""
@property
@abstractmethod
def search_space_size(self):
"""The number of different hyperparameter permutations possible given the current
hyperparameter search space"""
##################################################
# Utility Methods:
##################################################
def _preparation_workflow(self):
"""Perform housekeeping tasks to prepare for core functionality like validating the
`Environment` and parameters, and updating the verbosity of individual Experiments"""
self._validate_environment()
self._validate_parameters()
self._update_verbosity()
@staticmethod
def _validate_environment():
"""Check that there is a currently active and unoccupied Environment instance"""
if G.Env is None:
raise EnvironmentInactiveError()
if G.Env.current_task is not None:
raise EnvironmentInvalidError("Must finish current task before starting a new one")
def _validate_parameters(self):
"""Ensure provided input parameters are properly formatted"""
self.target_metric = get_formatted_target_metric(
self.target_metric, G.Env.metrics, default_dataset="oof"
)
def _validate_guidelines(self):
"""Ensure provided Experiment guideline parameters are properly formatted"""
target_column = G.Env.target_column
id_column = G.Env.id_column
train_dataset = G.Env.train_dataset.copy()
self.feature_selector = self.feature_selector or train_dataset.columns.values
restricted_cols = [_ for _ in target_column + [id_column] if _ is not None]
self.feature_selector = [_ for _ in self.feature_selector if _ not in restricted_cols]
def _find_similar_experiments(self):
"""Look for Experiments that were performed under similar conditions (algorithm and
cross-experiment parameters)"""
if self.read_experiments is False:
return
model_params = dict(
model_init_params=self.model_init_params,
model_extra_params=self.model_extra_params,
feature_engineer=self.feature_engineer,
feature_selector=self.feature_selector,
)
if self.module_name == "keras":
model_params["model_init_params"]["layers"] = self.dummy_layers
model_params["model_init_params"]["compile_params"] = self.dummy_compile_params
experiment_finder = finder_selector(self.module_name)(
self.algorithm_name,
self.module_name,
G.Env.cross_experiment_key,
self.target_metric,
self.space,
G.Env.result_paths["global_leaderboard"],
G.Env.result_paths["description"],
model_params,
)
experiment_finder.find()
self.similar_experiments = experiment_finder.similar_experiments
self.logger.print_saved_results_header()
def _update_verbosity(self):
"""Update :attr:`environment.Environment.reporting_params` if required by :attr:`verbose`"""
#################### Mute non-critical console logging for Experiments ####################
if self.verbose in [0, 1]:
G.Env.reporting_params.setdefault("console_params", {})["level"] = "CRITICAL"
#################### Blacklist 'script_backup' ####################
G.Env.result_paths["script_backup"] = None
class SKOptPro(BaseOptPro, metaclass=ABCMeta):
def __init__(
self,
target_metric=None,
iterations=1,
verbose=1,
read_experiments=True,
reporter_parameters=None,
warn_on_re_ask=False,
#################### Optimizer Class Parameters ####################
base_estimator="GP",
n_initial_points=10,
acquisition_function="gp_hedge",
acquisition_optimizer="auto",
random_state=32,
acquisition_function_kwargs=None,
acquisition_optimizer_kwargs=None,
#################### Minimizer Parameters ####################
n_random_starts="DEPRECATED",
callbacks=None,
#################### Other Parameters ####################
base_estimator_kwargs=None,
):
"""Base class for SKOpt-based Optimization Protocols
There are two important methods for all :class:`SKOptPro` descendants that should be
invoked after initialization:
1. :meth:`~hyperparameter_hunter.optimization.protocol_core.BaseOptPro.forge_experiment`
2. :meth:`~hyperparameter_hunter.optimization.protocol_core.BaseOptPro.go`
Parameters
----------
target_metric: Tuple, default=("oof", <:attr:`environment.Environment.metrics`[0]>)
Rarely necessary to explicitly provide this, as the default is usually sufficient. Path
denoting the metric to be used to compare Experiment performance. The first value
should be one of ["oof", "holdout", "in_fold"]. The second value should be the name of
a metric being recorded according to :attr:`environment.Environment.metrics_params`.
See the documentation for :func:`metrics.get_formatted_target_metric` for more info.
Any values returned by, or given as the `target_metric` input to,
:func:`~hyperparameter_hunter.metrics.get_formatted_target_metric` are acceptable
values for :attr:`BaseOptPro.target_metric`
iterations: Int, default=1
Number of Experiments to conduct during optimization upon invoking :meth:`BaseOptPro.go`
verbose: {0, 1, 2}, default=1
Verbosity mode for console logging. 0: Silent. 1: Show only logs from the Optimization
Protocol. 2: In addition to logs shown when verbose=1, also show the logs from
individual Experiments
read_experiments: Boolean, default=True
If True, all Experiment records that fit in the current :attr:`space` and guidelines,
and match :attr:`algorithm_name`, will be read in and used to fit any optimizers
reporter_parameters: Dict, or None, default=None
Additional parameters passed to :meth:`reporting.OptimizationReporter.__init__`. Note:
Unless provided explicitly, the key "do_maximize" will be added by default to
`reporter_params`, with a value inferred from the `direction` of :attr:`target_metric`
in `G.Env.metrics`. In nearly all cases, the "do_maximize" key should be ignored,
as there are very few reasons to explicitly include it
warn_on_re_ask: Boolean, default=False
If True, and the internal `optimizer` recommends a point that has already been evaluated
on invocation of `ask`, a warning is logged before recommending a random point. Either
way, a random point is used instead of already-evaluated recommendations. However,
logging the fact that this has taken place can be useful to indicate that the optimizer
may be stalling, especially if it repeatedly recommends the same point. In these cases,
if the suggested point is not optimal, it can be helpful to switch to a different OptPro
(especially `DummyOptPro`), which will suggest points using different criteria
Other Parameters
----------------
base_estimator: {SKLearn Regressor, "GP", "RF", "ET", "GBRT", "DUMMY"}, default="GP"
If not string, should inherit from `sklearn.base.RegressorMixin`. In addition, the
`predict` method should have an optional `return_std` argument, which returns
`std(Y | x)`, along with `E[Y | x]`.
If `base_estimator` is a string in {"GP", "RF", "ET", "GBRT", "DUMMY"}, a surrogate
model corresponding to the relevant `X_minimize` function is created
n_initial_points: Int, default=10
Number of complete evaluation points necessary before allowing Experiments to be
approximated with `base_estimator`. Any valid Experiment records found will count as
initialization points. If enough Experiment records are not found, additional points
will be randomly sampled
acquisition_function: {"LCB", "EI", "PI", "gp_hedge"}, default="gp_hedge"
Function to minimize over the posterior distribution. Can be any of the following:
* "LCB": Lower confidence bound
* "EI": Negative expected improvement
* "PI": Negative probability of improvement
* "gp_hedge": Probabilistically choose one of the above three acquisition functions at
every iteration
* The gains `g_i` are initialized to zero
* At every iteration,
* Each acquisition function is optimised independently to propose a candidate
point `X_i`
* Out of all these candidate points, the next point `X_best` is chosen by
`softmax(eta g_i)`
* After fitting the surrogate model with `(X_best, y_best)`, the gains are
updated such that `g_i -= mu(X_i)`
acquisition_optimizer: {"sampling", "lbfgs", "auto"}, default="auto"
Method to minimize the acquisition function. The fit model is updated with the optimal
value obtained by optimizing `acq_func` with `acq_optimizer`
* "sampling": `acq_func` is optimized by computing `acq_func` at `n_initial_points`
randomly sampled points.
* "lbfgs": `acq_func` is optimized by
* Randomly sampling `n_restarts_optimizer` (from `acq_optimizer_kwargs`) points
* "lbfgs" is run for 20 iterations with these initial points to find local minima
* The optimal of these local minima is used to update the prior
* "auto": `acq_optimizer` is configured on the basis of the `base_estimator` and the
search space. If the space is `Categorical` or if the provided estimator is based on
tree-models, then this is set to "sampling"
random_state: Int, `RandomState` instance, or None, default=32
Set to something other than None for reproducible results
acquisition_function_kwargs: Dict, or None, default=dict(xi=0.01, kappa=1.96)
Additional arguments passed to the acquisition function
acquisition_optimizer_kwargs: Dict, or None, default=dict(n_points=10000, n_restarts_optimizer=5, n_jobs=1)
Additional arguments passed to the acquisition optimizer
n_random_starts: ...
.. deprecated:: 3.0.0
Use `n_initial_points`, instead. Will be removed in 3.2.0
callbacks: Callable, list of callables, or None, default=[]
If callable, then `callbacks(self.optimizer_result)` is called after each update to
:attr:`optimizer`. If list, then each callable is called
base_estimator_kwargs: Dict, or None, default={}
Additional arguments passed to `base_estimator` when it is initialized
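As a hedged sketch of how these `Optimizer` parameters are usually supplied (through a concrete
descendant such as
:class:`~hyperparameter_hunter.optimization.backends.skopt.protocols.BayesianOptPro`)::

    opt = BayesianOptPro(
        iterations=24,
        base_estimator="GP",        # Gaussian Process surrogate
        n_initial_points=8,         # Evaluations required before trusting the surrogate
        acquisition_function="EI",  # Negative expected improvement
        random_state=42,
    )
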
Methods
-------
forge_experiment
Define constraints on Experiments conducted by OptPro (like hyperparameter search space)
go
Start optimization
Notes
-----
To provide initial input points for evaluation, individual Experiments can be executed prior
to instantiating an Optimization Protocol. The results of these Experiments will
automatically be detected and cherished by the optimizer.
:class:`.SKOptPro` and its children in :mod:`.optimization` rely heavily
on the utilities provided by the `Scikit-Optimize` library, so thank you to the creators and
contributors for their excellent work."""
# TODO: Add 'EIps', and 'PIps' to the allowable `acquisition_function` values - Will need to return execution times
#################### Optimizer Parameters ####################
self.base_estimator = base_estimator
self.n_initial_points = n_initial_points
self.acquisition_function = acquisition_function
self.acquisition_optimizer = acquisition_optimizer
self.random_state = random_state
self.acquisition_function_kwargs = dict(xi=0.01, kappa=1.96)
self.acquisition_function_kwargs.update(acquisition_function_kwargs or {})
self.acquisition_optimizer_kwargs = dict(n_points=10000, n_restarts_optimizer=5, n_jobs=1)
self.acquisition_optimizer_kwargs.update(acquisition_optimizer_kwargs or {})
self.callbacks = callbacks or []
if n_random_starts != "DEPRECATED":
self.n_initial_points = n_random_starts
warn(DeprecatedWarning("n_random_starts", "3.0.0", "3.2.0", "Use `n_initial_points`"))
#################### Other Parameters ####################
self.base_estimator_kwargs = base_estimator_kwargs or {}
#################### Placeholder Attributes ####################
self.optimizer = None
self.optimizer_result = None
self.current_hyperparameters_list = None
super().__init__(
target_metric=target_metric,
iterations=iterations,
verbose=verbose,
read_experiments=read_experiments,
reporter_parameters=reporter_parameters,
warn_on_re_ask=warn_on_re_ask,
)
def _set_hyperparameter_space(self):
"""Initialize :attr:`space` according to the provided hyperparameter search dimensions, and
:attr:`base_estimator`, and :attr:`optimizer`"""
self.space = Space(dimensions=self.dimensions)
self._prepare_estimator()
self._build_optimizer()
def _prepare_estimator(self):
"""Initialize :attr:`base_estimator` with :attr:`space` via `skopt.utils.cook_estimator`"""
self.base_estimator = cook_estimator(
self.base_estimator, space=self.space, **self.base_estimator_kwargs
)
def _build_optimizer(self):
"""Set :attr:`optimizer` to the optimizing class used to both estimate the utility of sets
of hyperparameters by learning from executed Experiments, and suggest points at which the
objective should be evaluated"""
self.optimizer = Optimizer(
dimensions=self.space,
base_estimator=self.base_estimator,
n_initial_points=self.n_initial_points,
acq_func=self.acquisition_function,
acq_optimizer=self.acquisition_optimizer,
random_state=self.random_state,
acq_func_kwargs=self.acquisition_function_kwargs,
acq_optimizer_kwargs=self.acquisition_optimizer_kwargs,
warn_on_re_ask=self.warn_on_re_ask,
)
def _update_optimizer(self, hyperparameters, score, fit=True):
"""Record an observation (or set of observations) of the objective function
To add observations without fitting a new model, set `fit`=False. To add multiple observations
in a batch, pass a list-of-lists for `hyperparameters` and a list of scalars for `score`
Parameters
----------
hyperparameters: List:
Hyperparameter values in the search space at which the objective function was evaluated
score: Number, list
Value of the objective function at `hyperparameters` in the hyperparameter space
fit: Boolean, default=True
Fit a model to observed evaluations of the objective. Regardless of `fit`, a model will
only be fitted after telling :attr:`n_initial_points` points to :attr:`optimizer`"""
if self.do_maximize:
score = -score
self.optimizer_result = self.optimizer.tell(hyperparameters, score, fit=fit)
def _execute_experiment(self):
"""After executing parent's :meth:`_execute_experiment`, fit :attr:`optimizer` with the set
of hyperparameters that were used, and the utility of those hyperparameters"""
super()._execute_experiment()
self._update_optimizer(self.current_hyperparameters_list, self.current_score)
if eval_callbacks(self.callbacks, self.optimizer_result):
return
def _get_current_hyperparameters(self):
"""Ask :attr:`optimizer` for the upcoming set of hyperparameters that should be searched,
then format them to be used in the next Experiment
Returns
-------
current_hyperparameters: Dict