# Source code for syne_tune.blackbox_repository.blackbox_tabular

# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from pathlib import Path
from typing import List, Dict, Optional, Tuple, Union, Any
import pandas as pd
import numpy as np

from syne_tune.blackbox_repository.blackbox import (
    Blackbox,
    ObjectiveFunctionResult,
)
from syne_tune.blackbox_repository.serialize import (
    serialize_configspace,
    deserialize_configspace,
    deserialize_metadata,
    serialize_metadata,
)


class BlackboxTabular(Blackbox):
    """
    Blackbox that contains tabular evaluations (e.g. all hyperparameters
    evaluated on all fidelities).

    We use a separate class than
    :class:`~syne_tune.blackbox_repository.BlackboxOffline`, as performance
    improvement can be made by avoiding to repeat hyperparameters and by
    storing all evaluations in a single table.

    Additional arguments on top of parent class
    :class:`~syne_tune.blackbox_repository.Blackbox`:

    :param hyperparameters: dataframe of hyperparameters, shape
        ``(num_evals, num_hps)``, columns must match hyperparameter names of
        ``configuration_space``
    :param objectives_evaluations: values of recorded objectives, must have
        shape ``(num_evals, num_seeds, num_fidelities, num_objectives)``
    :param fidelity_values: values of the ``num_fidelities`` fidelities,
        default to ``[1, ..., num_fidelities]``
    """

    def __init__(
        self,
        hyperparameters: pd.DataFrame,
        configuration_space: Dict[str, Any],
        fidelity_space: Dict[str, Any],
        objectives_evaluations: np.ndarray,
        fidelity_values: Optional[np.ndarray] = None,
        objectives_names: Optional[List[str]] = None,
    ):
        super(BlackboxTabular, self).__init__(
            configuration_space=configuration_space,
            fidelity_space=fidelity_space,
            objectives_names=objectives_names,
        )
        assert len(fidelity_space) == 1, "Only a single fidelity supported for now"
        # todo missing-value support, should boils down to droping nans in
        #  ``hyperparameter_objectives_values``
        num_hps = len(hyperparameters.columns)
        assert objectives_evaluations.ndim == 4
        (
            num_evals,
            num_seeds,
            num_fidelities,
            num_objectives,
        ) = objectives_evaluations.shape
        self.num_seeds = num_seeds
        self.num_fidelities = num_fidelities
        if fidelity_values is None:
            self._fidelity_values = np.arange(1, num_fidelities + 1)
        else:
            # assert sorted(fidelity_values.tolist()) == fidelity_values
            self._fidelity_values = fidelity_values
        # allows to retrieve the index in the objectives_evaluations of a given fidelity
        self.fidelity_map = {
            value: index for index, value in enumerate(self._fidelity_values)
        }
        self.hyperparameters = hyperparameters
        # builds a dataframe to retrieve in O(1) index given a hyperparameter, we
        # could have use a dict but chose a dataframe instead as 1) it is easier
        # since the hyperparameters are itself given in a dataframe (otherwise we
        # would need to have hashable type from the dataframe value) 2) we can
        # support in the future querying multiple results at once efficiently
        self._hp_cols = list(hyperparameters.columns.values)
        self.hyperparameters_index = hyperparameters.copy()
        self.hyperparameters_index["index"] = hyperparameters.index
        self.hyperparameters_index.set_index(self._hp_cols, inplace=True)
        self.objectives_evaluations = objectives_evaluations
        if objectives_names is None:
            self.objectives_names = [f"y{i}" for i in range(num_objectives)]
        assert len(self.objectives_evaluations) == len(hyperparameters)
        # Fix: the original assertion message applied ``max`` to the scalar
        # ``.upper`` attribute, which raised ``TypeError`` whenever the assert
        # actually failed, hiding the intended diagnostic.
        fidelity_upper = next(iter(fidelity_space.values())).upper
        assert (
            max(self._fidelity_values) <= fidelity_upper
        ), f"{max(self._fidelity_values)}, {fidelity_upper}"
        assert len(hyperparameters) == len(
            hyperparameters.drop_duplicates()
        ), "some hps are duplicated, use a seed column"
        assert len(configuration_space) == num_hps
        for name in configuration_space.keys():
            assert name in hyperparameters.columns
        assert len(self.objectives_names) == num_objectives

    def _objective_function(
        self,
        configuration: Union[dict, int],
        fidelity: Optional[dict] = None,
        seed: Optional[int] = None,
    ) -> ObjectiveFunctionResult:
        """Look up tabular evaluations for one configuration.

        :param configuration: either a dict of hyperparameters (matched against
            the stored table) or an integer row index into the table
        :param fidelity: single-entry dict ``{fidelity_name: value}``; if
            ``None``, results for all fidelities are returned
        :param seed: seed index in ``[0, num_seeds)``; drawn at random if not
            given
        :return: dict from objective name to value if ``fidelity`` is given,
            otherwise an array of shape ``(num_fidelities, num_objectives)``
        :raises ValueError: if ``configuration`` is not present in the table
        """
        if seed is not None:
            assert 0 <= seed < self.num_seeds
        else:
            seed = np.random.randint(0, self.num_seeds)

        if not isinstance(configuration, dict):
            # integer index: return all fidelities for that row (``fidelity``
            # is ignored in this branch)
            objectives_values = self.objectives_evaluations[configuration, seed, :, :]
            return objectives_values

        try:
            key = tuple(configuration[key] for key in self._hp_cols)
            matching_index = self.hyperparameters_index.loc[key].values
        except KeyError:
            raise ValueError(
                f"the hyperparameter {configuration} is not present in available evaluations. Use ``add_surrogate(blackbox)`` if"
                f" you want to add interpolation or a surrogate model that support querying any configuration."
            )
        df_found = self.hyperparameters.loc[matching_index]
        assert len(df_found) == 1
        index = df_found.index.values[0]

        if fidelity is None:
            # returns all fidelities
            objectives_values = self.objectives_evaluations[index, seed, :, :]
            return objectives_values
        else:
            fidelity_index = self.fidelity_map[list(fidelity.values())[0]]
            objectives_values = self.objectives_evaluations[
                index, seed, fidelity_index, :
            ]
            return dict(zip(self.objectives_names, objectives_values))

    @property
    def fidelity_values(self) -> np.ndarray:
        """:return: array of the ``num_fidelities`` fidelity values"""
        return self._fidelity_values

    def _impute_objectives_values(self) -> Tuple[pd.DataFrame, np.ndarray]:
        """Replaces nan values in objectives with first previous non-nan value.

        Time objective should be cumulative, otherwise each step will consume
        additional time.

        :return: tuple of (hyperparameters, objectives_evaluations) with nan
            values imputed and rows that remain all-nan dropped
        """
        # Replace nan with previous value. Assumes that elapsed time is cumulative.
        objectives_evaluations = self.objectives_evaluations.copy()
        hyperparameters = self.hyperparameters.copy()
        (
            num_configs,
            num_seeds,
            num_fidelities,
            num_objectives,
        ) = objectives_evaluations.shape
        for config_idx in range(num_configs):
            for seed_idx in range(num_seeds):
                for fidelity_idx in range(num_fidelities):
                    for objective_idx in range(num_objectives):
                        if np.isnan(
                            objectives_evaluations[config_idx][seed_idx][fidelity_idx][
                                objective_idx
                            ]
                        ):
                            # NOTE(review): for fidelity_idx == 0 this reads
                            # index -1, i.e. the LAST fidelity, not a previous
                            # one — confirm whether that wraparound is intended.
                            objectives_evaluations[config_idx][seed_idx][fidelity_idx][
                                objective_idx
                            ] = objectives_evaluations[config_idx][seed_idx][
                                fidelity_idx - 1
                            ][
                                objective_idx
                            ]
        # Drop all hyperparameters with all nan objectives.
        nan_mask = np.isnan(objectives_evaluations).any((1, 2, 3))
        hyperparameters = hyperparameters[~nan_mask]
        objectives_evaluations = objectives_evaluations[~nan_mask]
        return hyperparameters, objectives_evaluations

    # TODO: It is odd that ``y`` is transposed when compared to
    # ``objectives_evaluations``. Keep it this way, but it would be simpler
    # to understand if this was not done
    def hyperparameter_objectives_values(
        self, predict_curves: bool = False
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        If ``predict_curves`` is False, the shape of ``X`` is
        ``(num_evals * num_seeds * num_fidelities, num_hps + 1)``, the shape of
        ``y`` is ``(num_evals * num_seeds * num_fidelities, num_objectives)``.
        This can be reshaped to ``(num_fidelities, num_seeds, num_evals, *)``.
        The final column of ``X`` is the fidelity value (only a single fidelity
        attribute is supported).

        If ``predict_curves`` is True, the shape of ``X`` is
        ``(num_evals * num_seeds, num_hps)``, the shape of ``y`` is
        ``(num_evals * num_seeds, num_fidelities * num_objectives)``. The
        latter can be reshaped to
        ``(num_seeds, num_evals, num_fidelities, num_objectives)``.

        :param predict_curves: See above. Default is ``False``
        :return: Dataframes corresponding to ``X`` and ``y``
        """
        objectives_evaluations = self.objectives_evaluations
        hyperparameters = self.hyperparameters
        # Impute lazily: only pay the (slow) imputation loop if nans exist
        if np.isnan(np.sum(objectives_evaluations)):
            hyperparameters, objectives_evaluations = self._impute_objectives_values()
        if not predict_curves:
            Xs = []
            fidelity_attr = list(self.fidelity_space.keys())[0]
            for fidelity_index, fidelity_value in enumerate(self.fidelity_values):
                X = hyperparameters.copy()
                X[fidelity_attr] = fidelity_value
                for seed in range(self.num_seeds):
                    Xs.append(X)
            X = pd.concat(Xs, ignore_index=True)
            # y can be reshaped to
            # (num_fidelities, num_seeds, num_evals, num_objectives), while
            # objectives_evaluations has shape
            # (num_evals, num_seeds, num_fidelities, num_objectives)
            num_objectives = len(self.objectives_names)
            y = pd.DataFrame(
                data=objectives_evaluations.transpose((2, 1, 0, 3)).reshape(
                    (-1, num_objectives)
                ),
                columns=self.objectives_names,
            )
        else:
            Xs = [hyperparameters] * self.num_seeds
            X = pd.concat(Xs, ignore_index=True)
            # y can be reshaped to
            # (num_seeds, num_evals, num_fidelities, num_objectives)
            num_rows = objectives_evaluations.shape[0] * self.num_seeds
            y = pd.DataFrame(
                data=objectives_evaluations.transpose((1, 0, 2, 3)).reshape(
                    (num_rows, -1)
                )
            )
        return X, y

    def rename_objectives(
        self, objective_name_mapping: Dict[str, str]
    ) -> "BlackboxTabular":
        """
        :param objective_name_mapping: dictionary from old objective name to
            new one, old objective name must be present in the blackbox
        :return: a blackbox with as many objectives as ``objective_name_mapping``
        """
        # todo add test
        for old_name in objective_name_mapping.keys():
            assert old_name in self.objectives_names
        objective_indices = dict(
            zip(self.objectives_names, range(len(self.objectives_names)))
        )
        new_objectives_indices = [
            objective_indices[old_obj_name]
            for old_obj_name in objective_name_mapping.keys()
        ]
        return BlackboxTabular(
            hyperparameters=self.hyperparameters,
            configuration_space=self.configuration_space,
            fidelity_space=self.fidelity_space,
            objectives_evaluations=self.objectives_evaluations[
                :, :, :, new_objectives_indices
            ],
            fidelity_values=self._fidelity_values,
            objectives_names=list(objective_name_mapping.values()),
        )

    def all_configurations(self) -> List[Dict[str, Any]]:
        """
        This method is useful in order to set ``restrict_configurations`` in
        :class:`~syne_tune.optimizer.schedulers.searchers.StochasticAndFilterDuplicatesSearcher`
        or
        :class:`~syne_tune.optimizer.schedulers.searchers.GPFIFOSearcher`,
        which restricts the searcher to only return configurations in this
        set. This allows you to use a tabular blackbox without a surrogate.

        :return: List of all hyperparameter configurations for which objective
            values can be returned
        """
        return self.hyperparameters.to_dict("records")

    def __str__(self):
        (
            num_evals,
            num_seeds,
            num_fidelities,
            num_objectives,
        ) = self.objectives_evaluations.shape
        stats = {
            "total evaluations": self.objectives_evaluations.size // num_fidelities,
            "num fidelities": num_fidelities,
            "evaluated hps": num_evals,
            "seeds": num_seeds,
            "fidelities": num_fidelities,
            "objectives": self.objectives_names,
            "hyperparameter": list(self.configuration_space.keys()),
        }
        stats_str = ", ".join([f"{k}: {v}" for k, v in stats.items()])
        return f"tabular blackbox: {stats_str}"
def serialize(
    bb_dict: Dict[str, BlackboxTabular], path: str, metadata: Optional[dict] = None
):
    """Serialize a dictionary of task name to :class:`BlackboxTabular` to ``path``.

    All blackboxes must share the same hyperparameter table, fidelity values,
    objective names and evaluation shapes, so that one set of shared artifacts
    plus a single stacked objectives array can be written.

    :param bb_dict: dictionary from task name to blackbox
    :param path: directory to write to (created, including parents, if needed)
    :param metadata: optional extra metadata to store alongside the blackboxes
    """
    # check all blackboxes share the same search space and have evaluated the same hyperparameters
    # pick an arbitrary blackbox
    bb_first = next(iter(bb_dict.values()))
    for bb in bb_dict.values():
        pd.testing.assert_frame_equal(bb.hyperparameters, bb_first.hyperparameters)
        # assert bb.configuration_space == bb_first.configuration_space
        # assert bb.fidelity_space == bb_first.fidelity_space
        assert np.all(bb.fidelity_values == bb_first.fidelity_values)
        assert bb.objectives_names == bb_first.objectives_names
        assert bb.objectives_evaluations.shape == bb_first.objectives_evaluations.shape

    path = Path(path)
    # Fix: also create missing parent directories instead of failing with
    # FileNotFoundError when ``path`` is nested in a non-existent directory
    path.mkdir(parents=True, exist_ok=True)

    serialize_configspace(
        path=path,
        configuration_space=bb_first.configuration_space,
        fidelity_space=bb_first.fidelity_space,
    )

    # we use gzip as snappy is not supported for fastparquet engine compression
    # gzip is slower than the default snappy but more compact
    bb_first.hyperparameters.to_parquet(
        path / "hyperparameters.parquet",
        index=False,
        compression="gzip",
        engine="fastparquet",
    )

    with open(path / "objectives_evaluations.npy", "wb") as f:
        # (num_tasks, num_hps, num_seeds, num_fidelities, num_objectives)
        objectives = np.stack(
            [bb_dict[task].objectives_evaluations for task in bb_dict.keys()]
        )
        np.save(f, objectives.astype(np.float32), allow_pickle=False)

    with open(path / "fidelities_values.npy", "wb") as f:
        np.save(f, bb_first.fidelity_values, allow_pickle=False)

    metadata = metadata.copy() if metadata else {}
    metadata.update(
        {
            "objectives_names": bb_first.objectives_names,
            "task_names": list(bb_dict.keys()),
        }
    )
    serialize_metadata(
        path=path,
        metadata=metadata,
    )
def deserialize(path: str) -> Dict[str, BlackboxTabular]:
    """
    Deserialize blackboxes contained in a path that were saved with
    :func:`serialize` above.

    TODO: the API is currently dissonant with :func:`serialize`,
    :func:`deserialize` for
    :class:`~syne_tune.blackbox_repository.BlackboxOffline` as ``serialize``
    is a member function there. A possible way to unify is to have serialize
    also be a free function for ``BlackboxOffline``.

    :param path: a path that contains blackboxes that were saved with
        :func:`serialize`
    :return: a dictionary from task name to blackbox
    """
    root = Path(path)
    configuration_space, fidelity_space = deserialize_configspace(root)
    metadata = deserialize_metadata(root)
    objectives_names = metadata["objectives_names"]
    task_names = metadata["task_names"]
    hyperparameters = pd.read_parquet(
        root / "hyperparameters.parquet", engine="fastparquet"
    )
    with open(root / "fidelities_values.npy", "rb") as f:
        fidelity_values = np.load(f)
    # possibly we could use memmap to avoid memory use or speed-up loading times
    with open(root / "objectives_evaluations.npy", "rb") as f:
        objectives_evaluations = np.load(f)
    # One blackbox per task, all sharing the same hyperparameter table and
    # spaces; task i owns slice i of the stacked objectives array.
    blackboxes = {}
    for task_index, task_name in enumerate(task_names):
        blackboxes[task_name] = BlackboxTabular(
            hyperparameters=hyperparameters,
            configuration_space=configuration_space,
            fidelity_space=fidelity_space,
            objectives_evaluations=objectives_evaluations[task_index],
            fidelity_values=fidelity_values,
            objectives_names=objectives_names,
        )
    return blackboxes