Source code for syne_tune.optimizer.schedulers.searchers.gp_fifo_searcher

# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from typing import Optional, List, Dict, Any
import logging

from syne_tune.optimizer.schedulers.searchers.model_based_searcher import (
    BayesianOptimizationSearcher,
)
from syne_tune.optimizer.schedulers.searchers.gp_searcher_factory import (
    gp_fifo_searcher_factory,
    gp_fifo_searcher_defaults,
)
from syne_tune.optimizer.schedulers.searchers.gp_searcher_utils import (
    decode_state,
)
from syne_tune.optimizer.schedulers.searchers.utils.default_arguments import (
    check_and_merge_defaults,
)

logger = logging.getLogger(__name__)


class GPFIFOSearcher(BayesianOptimizationSearcher):
    r"""Gaussian process Bayesian optimization for FIFO scheduler

    This searcher must be used with
    :class:`~syne_tune.optimizer.schedulers.FIFOScheduler`. It provides
    Bayesian optimization, based on a Gaussian process surrogate model.

    It is *not* recommended to create :class:`GPFIFOSearcher` searcher objects
    directly, but rather to create
    :class:`~syne_tune.optimizer.schedulers.FIFOScheduler` objects with
    ``searcher="bayesopt"``, and to pass arguments here in ``search_options``.
    This will use the appropriate functions from
    :mod:`syne_tune.optimizer.schedulers.searchers.gp_searcher_factory` to
    create components in a consistent way.

    Most of the implementation is generic in
    :class:`~syne_tune.optimizer.schedulers.searchers.model_based_searcher.BayesianOptimizationSearcher`.

    Note: If metric values are to be maximized (``mode="max"`` in scheduler),
    the searcher uses ``map_reward`` to map metric values to internal
    criterion values, and *minimizes* the latter. The default choice is to
    multiply values by -1.

    Pending configurations (for which evaluation tasks are currently running)
    are dealt with by fantasizing (i.e., target values are drawn from the
    current posterior, and acquisition functions are averaged over this
    sample, see ``num_fantasy_samples``).

    The GP surrogate model uses a Matern 5/2 covariance function with
    automatic relevance determination (ARD) of input attributes, and a
    constant mean function. The acquisition function is expected improvement
    (EI). All hyperparameters of the surrogate model are estimated by
    empirical Bayes (maximizing the marginal likelihood). In general, this
    hyperparameter fitting is the most expensive part of a :meth:`get_config`
    call.

    Note that the full logic of construction based on arguments is given in
    :mod:`syne_tune.optimizer.schedulers.searchers.gp_searcher_factory`. In
    particular, see
    :func:`~syne_tune.optimizer.schedulers.searchers.gp_searcher_factory.gp_fifo_searcher_defaults`
    for default values.

    Additional arguments on top of parent class
    :class:`~syne_tune.optimizer.schedulers.searchers.StochasticSearcher`:

    :param clone_from_state: Internal argument, do not use
    :type clone_from_state: bool
    :param resource_attr: Name of resource attribute in reports. This is
        optional here, but required for multi-fidelity searchers.
        If ``resource_attr`` and ``cost_attr`` are given, cost values are read
        from each report and stored in the state. This allows cost models to
        be fit on more data.
    :type resource_attr: str, optional
    :param cost_attr: Name of cost attribute in data obtained from reporter
        (e.g., elapsed training time). Needed only by cost-aware searchers.
        Depending on whether ``resource_attr`` is given, cost values are read
        from each report or only at the end.
    :type cost_attr: str, optional
    :param num_init_random: Number of initial :meth:`get_config` calls for
        which randomly sampled configs are returned. Afterwards, the
        model-based searcher is used. Defaults to
        :const:`~syne_tune.optimizer.schedulers.searchers.bayesopt.tuning_algorithms.defaults.DEFAULT_NUM_INITIAL_RANDOM_EVALUATIONS`
    :type num_init_random: int, optional
    :param num_init_candidates: Number of initial candidates sampled at
        random in order to seed the model-based search in ``get_config``.
        Defaults to
        :const:`~syne_tune.optimizer.schedulers.searchers.bayesopt.tuning_algorithms.defaults.DEFAULT_NUM_INITIAL_CANDIDATES`
    :type num_init_candidates: int, optional
    :param num_fantasy_samples: Number of samples drawn for fantasizing
        (latent target values for pending evaluations), defaults to 20
    :type num_fantasy_samples: int, optional
    :param no_fantasizing: If ``True``, fantasizing is not done and pending
        evaluations are ignored. This may lead to loss of diversity in
        decisions. Defaults to ``False``
    :type no_fantasizing: bool, optional
    :param input_warping: If ``True``, we use a warping transform, so the
        kernel function becomes :math:`k(w(x), w(x'))`, where :math:`w(x)` is
        a warping transform parameterized by two non-negative numbers per
        component, which are learned as hyperparameters. See also
        :class:`~syne_tune.optimizer.schedulers.searchers.bayesopt.gpautograd.warping.Warping`.
        Coordinates which belong to categorical hyperparameters are not
        warped. Defaults to ``False``.
    :type input_warping: bool, optional
    :param boxcox_transform: If ``True``, target values are transformed before
        being fitted with a Gaussian marginal likelihood. This is using the
        Box-Cox transform with a parameter :math:`\lambda`, which is learned
        alongside other parameters of the surrogate model. The transform is
        :math:`\log y` for :math:`\lambda = 0`, and :math:`y - 1` for
        :math:`\lambda = 1`. This option requires the targets to be positive.
        Defaults to ``False``.
    :type boxcox_transform: bool, optional
    :param gp_base_kernel: Selects the covariance (or kernel) function to be
        used. Supported choices are
        :const:`~syne_tune.optimizer.schedulers.searchers.bayesopt.models.kernel_factory.SUPPORTED_BASE_MODELS`.
        Defaults to "matern52-ard" (Matern 5/2 with automatic relevance
        determination).
    :type gp_base_kernel: str, optional
    :param acq_function: Selects the acquisition function to be used.
        Supported choices are
        :const:`~syne_tune.optimizer.schedulers.searchers.bayesopt.models.acqfunc_factory.SUPPORTED_ACQUISITION_FUNCTIONS`.
        Defaults to "ei" (expected improvement acquisition function).
    :type acq_function: str, optional
    :param acq_function_kwargs: Some acquisition functions have additional
        parameters, which can be passed here. If none are given, default
        values are used.
    :type acq_function_kwargs: dict, optional
    :param initial_scoring: Scoring function to rank initial candidates
        (local optimization of EI is started from the top scorer):

        * "thompson_indep": Independent Thompson sampling; randomized score,
          which can increase exploration
        * "acq_func": score is the same (EI) acquisition function which is
          used for local optimization afterwards

        Defaults to
        :const:`~syne_tune.optimizer.schedulers.searchers.bayesopt.tuning_algorithms.defaults.DEFAULT_INITIAL_SCORING`
    :type initial_scoring: str, optional
    :param skip_local_optimization: If ``True``, the local gradient-based
        optimization of the acquisition function is skipped, and the
        top-ranked initial candidate (after initial scoring) is returned
        instead. In this case, ``initial_scoring="acq_func"`` makes most
        sense, otherwise the acquisition function will not be used. Defaults
        to ``False``
    :type skip_local_optimization: bool, optional
    :param opt_nstarts: Parameter for surrogate model fitting. Number of
        random restarts. Defaults to 2
    :type opt_nstarts: int, optional
    :param opt_maxiter: Parameter for surrogate model fitting. Maximum number
        of iterations per restart. Defaults to 50
    :type opt_maxiter: int, optional
    :param opt_warmstart: Parameter for surrogate model fitting. If ``True``,
        each fitting is started from the previous optimum. Not recommended in
        general. Defaults to ``False``
    :type opt_warmstart: bool, optional
    :param opt_verbose: Parameter for surrogate model fitting. If ``True``,
        lots of output. Defaults to ``False``
    :type opt_verbose: bool, optional
    :param max_size_data_for_model: If this is set, we limit the number of
        observations the surrogate model is fitted on to this value. If there
        are more observations, they are down sampled, see
        :class:`~syne_tune.optimizer.schedulers.searchers.bayesopt.models.subsample_state.SubsampleSingleFidelityStateConverter`
        for details. This down sampling is repeated every time the model is
        fit. The ``opt_skip_*`` predicates are evaluated before the state is
        down sampled. Pass ``None`` not to apply such a threshold. The default
        is
        :const:`~syne_tune.optimizer.schedulers.searchers.bayesopt.tuning_algorithms.defaults.DEFAULT_MAX_SIZE_DATA_FOR_MODEL`.
    :type max_size_data_for_model: int, optional
    :param max_size_top_fraction: Only used if ``max_size_data_for_model`` is
        set. This fraction of the down sampled set is filled with the top
        entries in the full set, the remaining ones are sampled at random
        from the full set, see
        :class:`~syne_tune.optimizer.schedulers.searchers.bayesopt.models.subsample_state.SubsampleSingleFidelityStateConverter`
        for details. Defaults to 0.25.
    :type max_size_top_fraction: float, optional
    :param opt_skip_init_length: Parameter for surrogate model fitting, skip
        predicate. Fitting is never skipped as long as the number of
        observations is below this threshold. Defaults to 150
    :type opt_skip_init_length: int, optional
    :param opt_skip_period: Parameter for surrogate model fitting, skip
        predicate. If ``>1``, and the number of observations is above
        ``opt_skip_init_length``, fitting is done only every
        ``opt_skip_period``-th call, and skipped otherwise. Defaults to 1
        (no skipping)
    :type opt_skip_period: int, optional
    :param allow_duplicates: If ``True``, :meth:`get_config` may return the
        same configuration more than once. Defaults to ``False``
    :type allow_duplicates: bool, optional
    :param restrict_configurations: If given, the searcher only suggests
        configurations from this list. This needs
        ``skip_local_optimization == True``. If ``allow_duplicates == False``,
        entries are popped off this list once suggested.
    :type restrict_configurations: List[dict], optional
    :param map_reward: In the scheduler, the metric may be minimized or
        maximized, but internally, Bayesian optimization is minimizing the
        criterion. ``map_reward`` converts from metric to internal criterion:

        * "minus_x": ``criterion = -metric``
        * "<a>_minus_x": ``criterion = <a> - metric``. For example,
          "1_minus_x" maps accuracy to zero-one error

        From a technical standpoint, it does not matter what is chosen here,
        because the criterion is only used internally. Also note that
        criterion data is always normalized to mean 0, variance 1 before
        being fitted with a Gaussian process. Defaults to "1_minus_x"
    :type map_reward: str or :class:`MapReward`, optional
    :param transfer_learning_task_attr: Used to support transfer HPO, where
        the state contains observed data from several tasks, one of which is
        the active one. To this end, ``config_space`` must contain a
        categorical parameter of name ``transfer_learning_task_attr``, whose
        range are all task IDs. Also, ``transfer_learning_active_task`` must
        denote the active task, and ``transfer_learning_active_config_space``
        is used as ``active_config_space`` argument in
        :class:`~syne_tune.optimizer.schedulers.searchers.utils.HyperparameterRanges`.
        This allows us to use a narrower search space for the active task
        than for the union of all tasks (``config_space`` must be that),
        which is needed if some configurations of non-active tasks lie
        outside of the ranges in ``active_config_space``. One of the
        implications is that :meth:`filter_observed_data` is selecting
        configs of the active task, so that incumbents or exclusion lists are
        restricted to data from the active task.
    :type transfer_learning_task_attr: str, optional
    :param transfer_learning_active_task: See ``transfer_learning_task_attr``.
    :type transfer_learning_active_task: str, optional
    :param transfer_learning_active_config_space: See
        ``transfer_learning_task_attr``. If not given, ``config_space`` is
        the search space for the active task as well. This active config
        space need not contain the ``transfer_learning_task_attr`` parameter.
        In fact, this parameter is set to a categorical with
        ``transfer_learning_active_task`` as single value, so that new
        configs are chosen for the active task only.
    :type transfer_learning_active_config_space: Dict[str, Any], optional
    :param transfer_learning_model: See ``transfer_learning_task_attr``.
        Specifies the surrogate model to be used for transfer learning:

        * "matern52_product": Kernel is product of Matern 5/2 (not ARD) on
          ``transfer_learning_task_attr`` and Matern 5/2 (ARD) on the rest.
          Assumes that data from the same task are more closely related than
          data from different tasks
        * "matern52_same": Kernel is Matern 5/2 (ARD) on the rest of the
          variables, ``transfer_learning_task_attr`` is ignored. Assumes
          that data from all tasks can be merged together

        Defaults to "matern52_product"
    :type transfer_learning_model: str, optional
    """

    def __init__(
        self,
        config_space: Dict[str, Any],
        metric: str,
        points_to_evaluate: Optional[List[Dict[str, Any]]] = None,
        clone_from_state: bool = False,
        **kwargs,
    ):
        super().__init__(
            config_space,
            metric=metric,
            points_to_evaluate=points_to_evaluate,
            random_seed_generator=kwargs.get("random_seed_generator"),
            random_seed=kwargs.get("random_seed"),
        )
        if not clone_from_state:
            kwargs["config_space"] = config_space
            kwargs["metric"] = metric
            kwargs_int = self._create_kwargs_int(kwargs)
        else:
            # Internal constructor, bypassing the factory
            # Note: Members which are part of the mutable state will be
            # overwritten in ``_restore_from_state``
            kwargs_int = kwargs.copy()
        self._call_create_internal(kwargs_int)

    def _create_kwargs_int(self, kwargs):
        _kwargs = check_and_merge_defaults(
            kwargs, *gp_fifo_searcher_defaults(kwargs), dict_name="search_options"
        )
        kwargs_int = gp_fifo_searcher_factory(**_kwargs)
        # Extra arguments not parsed in factory
        self._copy_kwargs_to_kwargs_int(kwargs_int, kwargs)
        return kwargs_int

    def _call_create_internal(self, kwargs_int):
        """
        Part of constructor which can be different in subclasses
        """
        self._create_internal(**kwargs_int)
    def clone_from_state(self, state):
        # Create clone with mutable state taken from 'state'
        init_state = decode_state(state["state"], self._hp_ranges_in_state())
        skip_optimization = state["skip_optimization"]
        estimator = self.state_transformer.estimator
        # Call internal constructor
        new_searcher = GPFIFOSearcher(
            **self._new_searcher_kwargs_for_clone(),
            estimator=estimator,
            init_state=init_state,
            skip_optimization=skip_optimization,
        )
        new_searcher._restore_from_state(state)
        # Invalidate self (must not be used afterwards)
        self.state_transformer = None
        return new_searcher
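

# --- Usage sketch (illustrative, not part of the module above) --------------
# The class docstring recommends not instantiating ``GPFIFOSearcher`` directly,
# but creating a ``FIFOScheduler`` with ``searcher="bayesopt"`` and passing the
# arguments documented above via ``search_options``. The following is a minimal
# sketch of that pattern; the config space (learning rate, batch size) and the
# "accuracy" metric name are assumptions for illustration only.
if __name__ == "__main__":
    from syne_tune.config_space import loguniform, randint
    from syne_tune.optimizer.schedulers import FIFOScheduler

    # Hypothetical search space for a training script reporting "accuracy"
    config_space = {
        "learning_rate": loguniform(1e-5, 1e-1),
        "batch_size": randint(16, 256),
    }

    scheduler = FIFOScheduler(
        config_space,
        searcher="bayesopt",  # creates a GPFIFOSearcher internally
        search_options={
            # Arguments documented in the docstring above are passed here
            "num_init_random": 5,
            "opt_nstarts": 2,
            "input_warping": True,
        },
        metric="accuracy",
        mode="max",  # maximized metric is mapped to an internal minimization
                     # criterion via ``map_reward``
    )

    # ``scheduler`` would then be passed to ``syne_tune.Tuner`` together with a
    # trial backend; see the Syne Tune documentation for a complete setup.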