Source code for syne_tune.blackbox_repository.blackbox_surrogate

# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from typing import Optional, Tuple, List, Dict, Any
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import logging

from syne_tune.config_space import Categorical
from syne_tune.blackbox_repository.blackbox import (
    Blackbox,
    ObjectiveFunctionResult,
)
from syne_tune.blackbox_repository.blackbox_offline import BlackboxOffline

logger = logging.getLogger(__name__)


[docs] class Columns(BaseEstimator, TransformerMixin): def __init__(self, names=None): self.names = names
[docs] def fit(self, *args, **kwargs): return self
[docs] def transform(self, X): return X[self.names]
def _default_surrogate(surrogate): if surrogate is not None: return surrogate else: return KNeighborsRegressor(n_neighbors=1)
[docs] class BlackboxSurrogate(Blackbox): """ Fits a blackbox surrogates that can be evaluated anywhere, which can be useful for supporting interpolation/extrapolation. To wrap an existing blackbox with a surrogate estimator, use :func:`add_surrogate` which automatically extract ``X``, ``y`` matrices from available blackbox evaluations. The surrogate regression model is provided by ``surrogate``, it has to conform to the scikit-learn fit-predict API. If ``predict_curves`` is ``True``, the model maps features of the configuration to the whole curve over fidelities, separate for each metric and seed. This has several advantages. First, predictions are consistent: if all curves in the data respect a certain property which is retained under convex combinations, predictions have this property as well (examples: positivity, monotonicity). This is important for ``elapsed_time`` metrics. The regression models are also fairly compact, and prediction is fast, ``max_fit_samples`` is normally not needed. If ``predict_curves`` is ``False,`` the model maps features from configuration and fidelity to metric values (univariate regression). In this case, properties like monotonicity are not retained. Also, training can take long and the trained models can be large. This difference only matters if there are fidelities. Otherwise, regression is always univariate. If ``num_seeds`` is given, we maintain different surrogate models for each seed. Otherwise, a single surrogate model is fit to data across all seeds. If ``fit_differences`` is given, it contains names of objectives which are cumulative sums. For these objectives, the ``y`` data is transformed to finite differences before fitting the model. This is recommended for ``elapsed_time`` objectives. This feature only matters if there are fidelities. Additional arguments on top of parent class :class:`~syne_tune.blackbox_repository.Blackbox`: :param X: dataframe containing hyperparameters values. Shape is ``(num_seeds * num_evals, num_hps)`` if ``predict_curves`` is ``True``, ``(num_fidelities * num_seeds * num_evals, num_hps)`` otherwise :param y: dataframe containing objectives values. Shape is ``(num_seeds * num_evals, num_fidelities * num_objectives)`` if ``predict_curves`` is ``True``, and ``(num_fidelities * num_seeds * num_evals, num_objectives)`` otherwise :param surrogate: the model that is fitted to predict objectives given any configuration, default to KNeighborsRegressor(n_neighbors=1). If ``predict_curves`` is ``True``, this must be multi-variate regression, i.e. accept target matrices in ``fit``, where columns correspond to fidelities. Regression models from scikit-learn allow for that. Possible examples: :code:`KNeighborsRegressor(n_neighbors=1)`, :code:`MLPRegressor()` or any estimator obeying Scikit-learn API. The model is fit on top of pipeline that applies basic feature-processing to convert rows in ``X`` to vectors. We use the configuration_space hyperparameters types to deduce the types of columns in ``X`` (for instance, :class:`~syne_tune.config_space.Categorical` values are one-hot encoded). :param predict_curves: See above. Default is ``False`` (backwards compatible) :param num_seeds: See above :param fit_differences: See above :param max_fit_samples: maximum number of samples to be fed to the surrogate estimator, if the more data points than this number are passed, then they are subsampled without replacement. If ``num_seeds`` is used, this is a limit on the data per seed :param name: """ def __init__( self, X: pd.DataFrame, y: pd.DataFrame, configuration_space: Dict[str, Any], objectives_names: List[str], fidelity_space: Optional[dict] = None, fidelity_values: Optional[np.array] = None, surrogate=None, predict_curves: bool = False, num_seeds: int = 1, fit_differences: Optional[List[str]] = None, max_fit_samples: Optional[int] = None, name: Optional[str] = None, ): super(BlackboxSurrogate, self).__init__( configuration_space=configuration_space, fidelity_space=fidelity_space, objectives_names=objectives_names, ) assert len(X) == len(y) if self.fidelity_space is None or fidelity_values is None: # Always univariate regression then predict_curves = False fit_differences = None self._assert_shapes( X, y, predict_curves, fidelity_values, num_seeds, len(objectives_names) ) # todo other types of assert with configuration_space, objective_names, ... self.surrogate = _default_surrogate(surrogate) self.surrogate_pipeline = None self.max_fit_samples = max_fit_samples self.predict_curves = predict_curves if fit_differences is None: fit_differences = [] else: fit_differences = [objectives_names.index(name) for name in fit_differences] self.fit_differences = fit_differences self.name = name self._fidelity_values = fidelity_values self.num_seeds = num_seeds self.fit_surrogate(X=X, y=y) @staticmethod def _assert_shapes( X: pd.DataFrame, y: pd.DataFrame, predict_curves: bool, fidelity_values: Optional[np.ndarray], num_seeds: Optional[int], num_objectives: int, ): assert X.ndim == 2 and y.ndim == 2 num_rows = X.shape[0] assert num_rows == y.shape[0] assert num_seeds >= 1 and num_rows % num_seeds == 0 if predict_curves: num_fidelities = len(fidelity_values) assert ( y.shape[1] == num_objectives * num_fidelities ), f"y.shape[1] = {y.shape[1]} != {num_fidelities} * {num_objectives}" elif fidelity_values is not None: num_fidelities = len(fidelity_values) num_evalsxseeds = num_rows // num_fidelities assert ( num_evalsxseeds >= 1 and num_rows == num_evalsxseeds * num_fidelities ), f"X.shape[0] = {num_rows} != {num_fidelities} * {num_evalsxseeds}" assert num_evalsxseeds >= num_seeds assert ( y.shape[1] == num_objectives ), f"y.shape[1] = {y.shape[1]} != {num_objectives}" @property def fidelity_values(self) -> Optional[np.array]: return self._fidelity_values @property def num_fidelities(self) -> int: if self.fidelity_values is not None: num_fidelities = len(self.fidelity_values) else: num_fidelities = 1 return num_fidelities
[docs] @staticmethod def make_model_pipeline( configuration_space, fidelity_space, model, predict_curves=False ): """Create feature pipeline for scikit-learn model :param configuration_space: Configuration space :param fidelity_space: Fidelity space :param model: Scikit-learn model :param predict_curves: Predict full curves? :return: Feature pipeline """ # gets hyperparameters types, categorical for CategoricalHyperparameter, numeric for everything else numeric = [] categorical = [] if fidelity_space is not None and not predict_curves: surrogate_hps = dict() surrogate_hps.update(configuration_space) surrogate_hps.update(fidelity_space) else: surrogate_hps = configuration_space for hp_name, hp in surrogate_hps.items(): if isinstance(hp, Categorical): categorical.append(hp_name) else: numeric.append(hp_name) # builds a pipeline that standardize numeric features and one-hot categorical ones before applying # the surrogate model features_union = [] if len(categorical) > 0: # `sparse` renamed to `sparse_output` in version 1.2. Different # versions are used depending on the Python version try: encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore") except TypeError: encoder = OneHotEncoder(sparse=False, handle_unknown="ignore") features_union.append( ( "categorical", make_pipeline(Columns(names=categorical), encoder), ) ) if len(numeric) > 0: features_union.append( ("numeric", make_pipeline(Columns(names=numeric), StandardScaler())) ) return Pipeline( [ ("features", FeatureUnion(features_union)), ("standard scaler", StandardScaler(with_mean=False)), ("model", model), ] )
def _data_for_seeds(self, X: pd.DataFrame, y: pd.DataFrame): if self.num_seeds == 1: return [X], [y] else: dim1 = self.num_seeds dim0 = 1 if self.predict_curves else self.num_fidelities dim2 = X.shape[0] // (dim1 * dim0) assert dim2 >= 1 and X.shape[0] == dim0 * dim1 * dim2 Xs = [] ys = [] for seed in range(self.num_seeds): index = np.ravel( dim2 * (dim1 * np.arange(dim0).reshape((-1, 1)) + seed) + np.arange(dim2).reshape((1, -1)) ) Xs.append(X.iloc[index]) ys.append(y.iloc[index]) return Xs, ys def _fidelity_spacing(self) -> (np.ndarray, bool): subtract_vec = np.concatenate((np.zeros(1), self.fidelity_values[:-1])) spacing = self.fidelity_values - subtract_vec is_contiguous = np.all(spacing == 1) return spacing, is_contiguous def _transform_to_finite_differences(self, y: pd.DataFrame) -> pd.DataFrame: """ Note: ``fidelity_values`` need not be contiguous (1, 2, 3, ...). We use generalized weighted finite differences to account for that. :param y: Original data (cumulative sum format) :return: Transformed data (finite differences) """ num_fidelities = self.num_fidelities num_objectives = len(self.objectives_names) if num_fidelities > 1 and self.fit_differences: spacing, is_contiguous = self._fidelity_spacing() if self.predict_curves: y_shape = (-1, num_fidelities, num_objectives) else: y_shape = (num_fidelities, -1, num_objectives) y_data = y.to_numpy() y_orig_shape = y_data.shape y_data = y_data.reshape(y_shape) rng_fid_plus = np.arange(1, num_fidelities) rng_fid_minus = np.arange(num_fidelities - 1) for objective_pos in self.fit_differences: if self.predict_curves: y_temp = y_data[:, rng_fid_minus, objective_pos].copy() y_data[:, rng_fid_plus, objective_pos] -= y_temp if not is_contiguous: y_data[:, :, objective_pos] /= spacing.reshape((1, -1)) else: y_temp = y_data[rng_fid_minus, :, objective_pos].copy() y_data[rng_fid_plus, :, objective_pos] -= y_temp if not is_contiguous: y_data[:, :, objective_pos] /= spacing.reshape((-1, 1)) return pd.DataFrame(data=y_data.reshape(y_orig_shape)) else: return y def _transform_from_finite_differences(self, prediction: np.ndarray) -> np.ndarray: """ Note: ``fidelity_values`` need not be contiguous (``1, 2, 3, ...``). We use generalized weighted finite differences to account for that. :param prediction: Shape ``(num_fidelities, num_objectives)`` :return: """ num_fidelities = self.num_fidelities if num_fidelities > 1 and self.fit_differences: spacing, is_contiguous = self._fidelity_spacing() if is_contiguous: spacing = 1 for objective_pos in self.fit_differences: prediction_new = np.cumsum(prediction[:, objective_pos] * spacing) prediction[:, objective_pos] = prediction_new return prediction else: return prediction
[docs] def fit_surrogate(self, X: pd.DataFrame, y: pd.DataFrame) -> Blackbox: """ Fits a surrogate model to data from a blackbox. Here, the targets ``y`` can be a matrix with the number of columns equal to the number of fidelity values (the ``predict_curves = True`` case). """ self.surrogate_pipeline = [ self.make_model_pipeline( configuration_space=self.configuration_space, fidelity_space=self.fidelity_space, model=self.surrogate, predict_curves=self.predict_curves, ) for _ in range(self.num_seeds) ] y = self._transform_to_finite_differences(y) Xs, ys = self._data_for_seeds(X, y) for pipeline, features, targets in zip(self.surrogate_pipeline, Xs, ys): # todo would be nicer to have this in the feature pipeline num_data = len(features) if self.max_fit_samples is not None and self.max_fit_samples < num_data: random_indices = np.random.permutation(num_data)[: self.max_fit_samples] features = features.loc[random_indices] targets = targets.loc[random_indices] pipeline.fit(X=features, y=targets) return self
def _objective_function( self, configuration: Dict[str, Any], fidelity: Optional[dict] = None, seed: Optional[int] = None, ) -> ObjectiveFunctionResult: if seed is None: seed = np.random.randint(0, self.num_seeds) else: assert ( 0 <= seed < self.num_seeds ), f"seed = {seed}, must be in [0, {self.num_seeds - 1}]" surrogate_input = configuration.copy() single_fidelity = fidelity is not None do_fit_diffs = len(self.fit_differences) > 0 if self.fidelity_values is not None: fidelity_attr = self.fidelity_name() else: fidelity_attr = None if not self.predict_curves: # Univariate regression, where fidelity is an input if (not do_fit_diffs) and (single_fidelity or self.fidelity_values is None): if single_fidelity: surrogate_input.update(fidelity) # use the surrogate model for prediction prediction = self.surrogate_pipeline[seed].predict( pd.DataFrame([surrogate_input]) ) # converts the returned nd-array with shape (1, num_metrics) # to the list of objectives values prediction = prediction.reshape(-1).tolist() # convert prediction to dictionary prediction = dict(zip(self.objectives_names, prediction)) else: # when no fidelity is given and a fidelity space exists, we # return all fidelities # we construct a input dataframe with all fidelity for the # configuration given to call the transformer at once which # is more efficient due to vectorization surrogate_input_df = pd.DataFrame( [surrogate_input] * self.num_fidelities ) surrogate_input_df[fidelity_attr] = self.fidelity_values prediction = self._transform_from_finite_differences( self.surrogate_pipeline[seed].predict(surrogate_input_df) ) extract_fidelity = do_fit_diffs and single_fidelity else: # Multivariate regression prediction = self.surrogate_pipeline[seed].predict( pd.DataFrame([surrogate_input]) ) prediction = self._transform_from_finite_differences( prediction.reshape((self.num_fidelities, -1)) ) extract_fidelity = single_fidelity if extract_fidelity: assert self.fidelity_values is not None, "blackbox has no fidelities" # If there are several fidelity values, pick the first fidelity = list(fidelity.values())[0] ind = np.where(self.fidelity_values == fidelity) assert ind, f"fidelity {fidelity} not among {self.fidelity_values}" ind = ind[0] prediction = dict(zip(self.objectives_names, prediction[ind])) return prediction
[docs] def hyperparameter_objectives_values( self, predict_curves: bool = False ) -> Tuple[pd.DataFrame, pd.DataFrame]: raise NotImplementedError("This is a surrogate already!")
[docs] def add_surrogate( blackbox: Blackbox, surrogate=None, configuration_space: Optional[dict] = None, predict_curves: Optional[bool] = None, separate_seeds: bool = False, fit_differences: Optional[List[str]] = None, ): """ Fits a blackbox surrogates that can be evaluated anywhere, which can be useful for supporting interpolation/extrapolation. :param blackbox: the blackbox must implement :meth:`~syne_tune.blackbox_repository.Blackbox.hyperparameter_objectives_values` so that input/output are passed to estimate the model :param surrogate: the model that is fitted to predict objectives given any configuration. Possible examples: :code:`KNeighborsRegressor(n_neighbors=1)`, :code:`MLPRegressor()` or any estimator obeying Scikit-learn API. The model is fit on top of pipeline that applies basic feature-processing to convert rows in ``X`` to vectors. We use ``configuration_space`` to deduce the types of columns in ``X`` (categorical parameters are one-hot encoded). :param configuration_space: configuration space for the resulting blackbox surrogate. The default is ``blackbox.configuration_space``. But note that if ``blackbox`` is tabular, the domains in ``blackbox.configuration_space`` are typically categorical even for numerical parameters. :param predict_curves: If True, the surrogate uses multivariate regression to predict metric curves over fidelities. If False, fidelity is used as input. The latter can lead to inconsistent predictions along fidelity and is typically more expensive. If not given, the default value is ``False`` if ``blackbox`` is of type :class:`~syne_tune.blackbox_repository.BlackboxOffline`, otherwise ``True``. :param separate_seeds: If ``True``, seeds in ``blackbox`` map to seeds in the surrogate blackbox, which fits different models to each seed. If ``False``, the data from ``blackbox`` is merged for all seeds, and the surrogate represents a single seed. The latter provides more data for the surrogate model to be fit, but the variation between seeds is lost in the surrogate. Defaults to ``False``. :param fit_differences: Names of objectives which are cumulative sums. For these objectives, the ``y`` data is transformed to finite differences before fitting the model. This is recommended for ``elapsed_time`` objectives. :return: a blackbox where the output is obtained through the fitted surrogate """ if configuration_space is None: configuration_space = blackbox.configuration_space if separate_seeds and blackbox.fidelity_values is not None: num_seeds = len(blackbox.fidelity_values) else: num_seeds = 1 if predict_curves is None: # ``BlackboxOffline`` does not support True right now predict_curves = not isinstance(blackbox, BlackboxOffline) X, y = blackbox.hyperparameter_objectives_values(predict_curves) return BlackboxSurrogate( X=X, y=y, configuration_space=configuration_space, objectives_names=blackbox.objectives_names, fidelity_space=blackbox.fidelity_space, fidelity_values=blackbox.fidelity_values, surrogate=surrogate, predict_curves=predict_curves, num_seeds=num_seeds, fit_differences=fit_differences, )