Source code for syne_tune.blackbox_repository.blackbox_tabular

# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from pathlib import Path
from typing import List, Dict, Optional, Tuple, Union, Any
import pandas as pd
import numpy as np

from syne_tune.blackbox_repository.blackbox import (
    Blackbox,
    ObjectiveFunctionResult,
)
from syne_tune.blackbox_repository.serialize import (
    serialize_configspace,
    deserialize_configspace,
    deserialize_metadata,
    serialize_metadata,
)



[docs]
class BlackboxTabular(Blackbox):
    """
    Blackbox that contains tabular evaluations (e.g. all hyperparameters
    evaluated on all fidelities). We use a separate class than
    :class:`~syne_tune.blackbox_repository.BlackboxOffline`, as performance
    improvement can be made by avoiding to repeat hyperparameters and by storing
    all evaluations in a single table.

    Additional arguments on top of parent class
    :class:`~syne_tune.blackbox_repository.Blackbox`:

    :param hyperparameters: dataframe of hyperparameters, shape
        ``(num_evals, num_hps)``, columns must match hyperparameter names of
        ``configuration_space``
    :param objectives_evaluations: values of recorded objectives, must have
        shape ``(num_evals, num_seeds, num_fidelities, num_objectives)``
    :param fidelity_values: values of the ``num_fidelities`` fidelities, default
        to ``[1, ..., num_fidelities]``
    """

    def __init__(
        self,
        hyperparameters: pd.DataFrame,
        configuration_space: Dict[str, Any],
        fidelity_space: Dict[str, Any],
        objectives_evaluations: np.array,
        fidelity_values: Optional[np.array] = None,
        objectives_names: Optional[List[str]] = None,
    ):
        super(BlackboxTabular, self).__init__(
            configuration_space=configuration_space,
            fidelity_space=fidelity_space,
            objectives_names=objectives_names,
        )
        assert len(fidelity_space) == 1, "Only a single fidelity supported for now"
        # todo missing-value support, should boils down to droping nans in ``hyperparameter_objectives_values``
        num_hps = len(hyperparameters.columns)

        assert objectives_evaluations.ndim == 4
        (
            num_evals,
            num_seeds,
            num_fidelities,
            num_objectives,
        ) = objectives_evaluations.shape

        self.num_seeds = num_seeds
        self.num_fidelities = num_fidelities
        if fidelity_values is None:
            self._fidelity_values = np.arange(1, num_fidelities + 1)
        else:
            # assert sorted(fidelity_values.tolist()) == fidelity_values
            self._fidelity_values = fidelity_values

        # allows to retrieve the index in the objectives_evaluations of a given fidelity
        self.fidelity_map = {
            value: index for index, value in enumerate(self._fidelity_values)
        }
        self.hyperparameters = hyperparameters

        # builds a dataframe to retrieve in O(1) index given a hyperparameter, we could have use a dict but chose a
        # dataframe instead as 1) it is easier since the hyperparameters are itself given in a dataframe (otherwise
        # we would need to have hashable type from the dataframe value) 2) we can support in the future querying
        # multiple results at once efficiently
        self._hp_cols = list(hyperparameters.columns.values)
        self.hyperparameters_index = hyperparameters.copy()
        self.hyperparameters_index["index"] = hyperparameters.index
        self.hyperparameters_index.set_index(self._hp_cols, inplace=True)

        self.objectives_evaluations = objectives_evaluations
        if objectives_names is None:
            self.objectives_names = [f"y{i}" for i in range(num_objectives)]

        assert len(self.objectives_evaluations) == len(hyperparameters)
        assert len(fidelity_space) == 1, "only support single fidelity for now"
        assert (
            max(self._fidelity_values) <= list(fidelity_space.values())[0].upper
        ), f"{max(self._fidelity_values)}, {max(next(iter(fidelity_space.values())).upper)}"
        assert len(hyperparameters) == len(
            hyperparameters.drop_duplicates()
        ), "some hps are duplicated, use a seed column"
        assert len(configuration_space) == num_hps
        for name in configuration_space.keys():
            assert name in hyperparameters.columns

        assert len(self.objectives_names) == num_objectives

    def _objective_function(
        self,
        configuration: Union[dict, int],
        fidelity: Optional[dict] = None,
        seed: Optional[int] = None,
    ) -> ObjectiveFunctionResult:
        if seed is not None:
            assert 0 <= seed < self.num_seeds
        else:
            seed = np.random.randint(0, self.num_seeds)
        if not isinstance(configuration, dict):
            objectives_values = self.objectives_evaluations[configuration, seed, :, :]
            return objectives_values
        try:
            key = tuple(configuration[key] for key in self._hp_cols)
            matching_index = self.hyperparameters_index.loc[key].values
        except KeyError:
            raise ValueError(
                f"the hyperparameter {configuration} is not present in available evaluations. Use ``add_surrogate(blackbox)`` if"
                f" you want to add interpolation or a surrogate model that support querying any configuration."
            )

        df_found = self.hyperparameters.loc[matching_index]
        assert len(df_found) == 1
        index = df_found.index.values[0]

        if fidelity is None:
            # returns all fidelities
            objectives_values = self.objectives_evaluations[index, seed, :, :]
            return objectives_values
        else:
            fidelity_index = self.fidelity_map[list(fidelity.values())[0]]
            objectives_values = self.objectives_evaluations[
                index, seed, fidelity_index, :
            ]
            return dict(zip(self.objectives_names, objectives_values))

    @property
    def fidelity_values(self) -> np.array:
        return self._fidelity_values

    def _impute_objectives_values(self) -> Tuple[pd.DataFrame, np.array]:
        """Replaces nan values in objectives with first previous non-nan value.

        Time objective should be cumulative, otherwise each step will consume additional time.
        """
        # Replace nan with previous value. Assumes that elapsed time is cumulative.
        objectives_evaluations = self.objectives_evaluations.copy()
        hyperparameters = self.hyperparameters.copy()
        (
            num_configs,
            num_seeds,
            num_fidelities,
            num_objectives,
        ) = objectives_evaluations.shape
        for config_idx in range(num_configs):
            for seed_idx in range(num_seeds):
                for fidelity_idx in range(num_fidelities):
                    for objective_idx in range(num_objectives):
                        if np.isnan(
                            objectives_evaluations[config_idx][seed_idx][fidelity_idx][
                                objective_idx
                            ]
                        ):
                            objectives_evaluations[config_idx][seed_idx][fidelity_idx][
                                objective_idx
                            ] = objectives_evaluations[config_idx][seed_idx][
                                fidelity_idx - 1
                            ][
                                objective_idx
                            ]
        # Drop all hyperparameters with all nan objectives.
        nan_mask = np.isnan(objectives_evaluations).any((1, 2, 3))
        hyperparameters = hyperparameters[~nan_mask]
        objectives_evaluations = objectives_evaluations[~nan_mask]
        return hyperparameters, objectives_evaluations

    # TODO: It is odd that ``y`` is transposed when compared to
    # ``objectives_evaluations``. Keep it this way, but it would be simpler
    # to understand if this was not done

[docs]
    def hyperparameter_objectives_values(
        self, predict_curves: bool = False
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        If ``predict_curves`` is False, the shape of ``X`` is
        ``(num_evals * num_seeds * num_fidelities, num_hps + 1)``, the shape of ``y``
        is ``(num_evals * num_seeds * num_fidelities, num_objectives)``.
        This can be reshaped to ``(num_fidelities, num_seeds, num_evals, *)``.
        The final column of ``X`` is the fidelity value (only a single fidelity
        attribute is supported).

        If ``predict_curves`` is True, the shape of ``X`` is
        ``(num_evals * num_seeds, num_hps)``, the shape of ``y`` is
        ``(num_evals * num_seeds, num_fidelities * num_objectives)``. The latter
        can be reshaped to
        ``(num_seeds, num_evals, num_fidelities, num_objectives)``.

        :param predict_curves: See above. Default is ``False``
        :return: Dataframes corresponding to ``X`` and ``y``
        """
        objectives_evaluations = self.objectives_evaluations
        hyperparameters = self.hyperparameters
        if np.isnan(np.sum(objectives_evaluations)):
            hyperparameters, objectives_evaluations = self._impute_objectives_values()

        if not predict_curves:
            Xs = []
            fidelity_attr = list(self.fidelity_space.keys())[0]
            for fidelity_index, fidelity_value in enumerate(self.fidelity_values):
                X = hyperparameters.copy()
                X[fidelity_attr] = fidelity_value
                for seed in range(self.num_seeds):
                    Xs.append(X)
            X = pd.concat(Xs, ignore_index=True)
            # y can be reshaped to
            # (num_fidelities, num_seeds, num_evals, num_objectives), while
            # objectives_evaluations has shape
            # (num_evals, num_seeds, num_fidelities, num_objectives)
            num_objectives = len(self.objectives_names)
            y = pd.DataFrame(
                data=objectives_evaluations.transpose((2, 1, 0, 3)).reshape(
                    (-1, num_objectives)
                ),
                columns=self.objectives_names,
            )
        else:
            Xs = [hyperparameters] * self.num_seeds
            X = pd.concat(Xs, ignore_index=True)
            # y can be reshaped to
            # (num_seeds, num_evals, num_fidelities, num_objectives)
            num_rows = objectives_evaluations.shape[0] * self.num_seeds
            y = pd.DataFrame(
                data=objectives_evaluations.transpose((1, 0, 2, 3)).reshape(
                    (num_rows, -1)
                )
            )
        return X, y



[docs]
    def rename_objectives(
        self, objective_name_mapping: Dict[str, str]
    ) -> "BlackboxTabular":
        """
        :param objective_name_mapping: dictionary from old objective name to
            new one, old objective name must be present in the blackbox
        :return: a blackbox with as many objectives as ``objective_name_mapping``
        """
        # todo add test
        for old_name in objective_name_mapping.keys():
            assert old_name in self.objectives_names
        objective_indices = dict(
            zip(self.objectives_names, range(len(self.objectives_names)))
        )
        new_objectives_indices = [
            objective_indices[old_obj_name]
            for old_obj_name in objective_name_mapping.keys()
        ]
        return BlackboxTabular(
            hyperparameters=self.hyperparameters,
            configuration_space=self.configuration_space,
            fidelity_space=self.fidelity_space,
            objectives_evaluations=self.objectives_evaluations[
                :, :, :, new_objectives_indices
            ],
            fidelity_values=self._fidelity_values,
            objectives_names=list(objective_name_mapping.values()),
        )



[docs]
    def all_configurations(self) -> List[Dict[str, Any]]:
        """
        This method is useful in order to set ``restrict_configurations`` in
        :class:`~syne_tune.optimizer.schedulers.searchers.StochasticAndFilterDuplicatesSearcher`
        or
        :class:`~syne_tune.optimizer.schedulers.searchers.GPFIFOSearcher`,
        which restricts the searcher to only return configurations in this set.
        This allows you to use a tabular blackbox without a surrogate.

        :return: List of all hyperparameter configurations for which objective
            values can be returned
        """
        return self.hyperparameters.to_dict("records")


    def __str__(self):
        (
            num_evals,
            num_seeds,
            num_fidelities,
            num_objectives,
        ) = self.objectives_evaluations.shape
        stats = {
            "total evaluations": self.objectives_evaluations.size // num_fidelities,
            "num fidelities": num_fidelities,
            "evaluated hps": num_evals,
            "seeds": num_seeds,
            "fidelities": num_fidelities,
            "objectives": self.objectives_names,
            "hyperparameter": list(self.configuration_space.keys()),
        }
        stats_str = ", ".join([f"{k}: {v}" for k, v in stats.items()])
        return f"tabular blackbox: {stats_str}"




[docs]
def serialize(
    bb_dict: Dict[str, BlackboxTabular], path: str, metadata: Optional[dict] = None
):
    # check all blackboxes share the same search space and have evaluated the same hyperparameters
    # pick an arbitrary blackbox
    bb_first = next(iter(bb_dict.values()))
    for bb in bb_dict.values():
        pd.testing.assert_frame_equal(bb.hyperparameters, bb_first.hyperparameters)
        # assert bb.configuration_space == bb_first.configuration_space
        # assert bb.fidelity_space == bb_first.fidelity_space
        assert np.all(bb.fidelity_values == bb_first.fidelity_values)
        assert bb.objectives_names == bb_first.objectives_names
        assert bb.objectives_evaluations.shape == bb_first.objectives_evaluations.shape

    path = Path(path)

    path.mkdir(exist_ok=True)

    serialize_configspace(
        path=path,
        configuration_space=bb_first.configuration_space,
        fidelity_space=bb_first.fidelity_space,
    )

    # we use gzip as snappy is not supported for fastparquet engine compression
    # gzip is slower than the default snappy but more compact
    bb_first.hyperparameters.to_parquet(
        path / "hyperparameters.parquet",
        index=False,
        compression="gzip",
        engine="fastparquet",
    )

    with open(path / "objectives_evaluations.npy", "wb") as f:
        # (num_tasks, num_hps, num_seeds, num_fidelities, num_objectives)
        objectives = np.stack(
            [bb_dict[task].objectives_evaluations for task in bb_dict.keys()]
        )
        np.save(f, objectives.astype(np.float32), allow_pickle=False)

    with open(path / "fidelities_values.npy", "wb") as f:
        np.save(f, bb_first.fidelity_values, allow_pickle=False)

    metadata = metadata.copy() if metadata else {}
    metadata.update(
        {
            "objectives_names": bb_first.objectives_names,
            "task_names": list(bb_dict.keys()),
        }
    )
    serialize_metadata(
        path=path,
        metadata=metadata,
    )




[docs]
def deserialize(path: str) -> Dict[str, BlackboxTabular]:
    """
    Deserialize blackboxes contained in a path that were saved with :func:`serialize`
    above.

    TODO: the API is currently dissonant with :func:`serialize`,
    :func:`deserialize` for :class:`~syne_tune.blackbox_repository.BlackboxOffline`
    as ``serialize`` is a member function there. A possible way to unify is to
    have serialize also be a free function for ``BlackboxOffline``.

    :param path: a path that contains blackboxes that were saved with
        :func:`serialize`
    :return: a dictionary from task name to blackbox
    """
    path = Path(path)

    configuration_space, fidelity_space = deserialize_configspace(path)
    hyperparameters = pd.read_parquet(
        Path(path) / "hyperparameters.parquet", engine="fastparquet"
    )

    metadata = deserialize_metadata(path)
    objectives_names = metadata["objectives_names"]
    task_names = metadata["task_names"]

    with open(path / "fidelities_values.npy", "rb") as f:
        fidelity_values = np.load(f)

    # possibly we could use memmap to avoid memory use or speed-up loading times
    with open(path / "objectives_evaluations.npy", "rb") as f:
        objectives_evaluations = np.load(f)

    return {
        task: BlackboxTabular(
            hyperparameters=hyperparameters,
            configuration_space=configuration_space,
            fidelity_space=fidelity_space,
            objectives_evaluations=objectives_evaluations[i],
            fidelity_values=fidelity_values,
            objectives_names=objectives_names,
        )
        for i, task in enumerate(task_names)
    }