# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""
Wrap Surrogates from
YAHPO Gym - An Efficient Multi-Objective Multi-Fidelity Benchmark for Hyperparameter Optimization
Florian Pfisterer, Lennart Schneider, Julia Moosbauer, Martin Binder, Bernd Bischl
"""
from typing import Optional, List, Dict, Any
import logging
import shutil
from yahpo_gym import benchmark_set
import numpy as np
import zipfile
from pathlib import Path
from syne_tune.blackbox_repository.conversion_scripts.blackbox_recipe import (
BlackboxRecipe,
)
from syne_tune.blackbox_repository.conversion_scripts.scripts import (
default_metric,
metric_elapsed_time,
resource_attr,
)
from syne_tune.blackbox_repository.conversion_scripts.utils import (
repository_path,
blackbox_local_path,
)
from syne_tune.blackbox_repository.serialize import (
serialize_metadata,
)
import syne_tune.config_space as cs
from syne_tune.blackbox_repository.blackbox import Blackbox
from syne_tune.constants import ST_WORKER_ITER
from syne_tune.util import is_increasing, is_positive_integer
import ConfigSpace
from yahpo_gym.benchmark_set import BenchmarkSet
from yahpo_gym.configuration import list_scenarios
from yahpo_gym import local_config
logger = logging.getLogger(__name__)
def download(target_path: Path, version: str):
    """
    Download the ``yahpo_data`` release archive for ``version`` from GitHub and
    extract it into ``target_path``, unless the extracted directory
    ``target_path / f"yahpo_data-{version}"`` already exists.

    :param target_path: Directory the archive is extracted into
    :param version: Version tag of the ``yahpo_data`` GitHub release
    """
    # ``import urllib`` alone does not reliably expose the ``urllib.request``
    # submodule; it must be imported explicitly
    import urllib.request

    root = "https://github.com/slds-lmu/yahpo_data/archive/refs/tags/"
    target_file = target_path / f"yahpo_data-{version}"
    if not target_file.exists():
        logger.info(f"File {target_file} not found redownloading it.")
        # The archive is stored next to ``target_path`` as "<target_path>.zip"
        zip_path = str(target_path) + ".zip"
        urllib.request.urlretrieve(root + f"v{version}.zip", zip_path)
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(target_path)
    else:
        logger.info(f"File {target_file} found, skipping download.")
def _check_whether_iaml(benchmark: BenchmarkSet) -> bool:
return benchmark.config.config_id.startswith("iaml_")
def _check_whether_rbv2(benchmark: BenchmarkSet) -> bool:
return benchmark.config.config_id.startswith("rbv2_")
def _check_whether_nb301(benchmark: BenchmarkSet) -> bool:
return benchmark.config.config_id == "nb301"
NB301_ATTRIBUTE_NAME_PREFIX = "NetworkSelectorDatasetInfo_COLON_darts_COLON_"
class BlackBoxYAHPO(Blackbox):
    """
    A wrapper that allows putting a 'YAHPO' BenchmarkInstance into a Blackbox.

    If ``fidelities`` is given, it restricts ``fidelity_values`` to these values.
    The sequence must be positive int and increasing. This works only if there
    is a single fidelity attribute with integer values (but note that for
    some specific YAHPO benchmarks, a fractional fidelity is transformed to
    an integer one).

    Even though YAHPO interpolates between fidelities, it can make sense
    to restrict them to the values which have really been acquired in the
    data. Note that this restricts multi-fidelity schedulers like
    :class:`~syne_tune.optimizer.schedulers.HyperbandScheduler`, in that all
    their rungs levels have to be fidelity values.

    For example, for YAHPO ``iaml``, the fidelity ``trainsize`` has been
    acquired at [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1], this is transformed
    to [1, 2, 4, 8, 12, 16, 20]. By default, the fidelity is
    represented by ``cs.randint(1, 20)``, but if ``fidelities`` is passed,
    it uses ``cs.ordinal(fidelities)``.

    :param benchmark: YAHPO ``BenchmarkSet``
    :param fidelities: See above
    """

    def __init__(
        self,
        benchmark: BenchmarkSet,
        fidelities: Optional[List[int]] = None,
    ):
        self.benchmark = benchmark
        super(BlackBoxYAHPO, self).__init__(
            configuration_space=cs_to_synetune(
                self.benchmark.get_opt_space(drop_fidelity_params=True)
            ),
            fidelity_space=cs_to_synetune(self.benchmark.get_fidelity_space()),
            objectives_names=self.benchmark.config.y_names,
        )
        self.num_seeds = 1
        # Flags for the scenario families which need special treatment below
        self._is_iaml = _check_whether_iaml(benchmark)
        self._is_rbv2 = _check_whether_rbv2(benchmark)
        self._is_nb301 = _check_whether_nb301(benchmark)
        if self._is_rbv2:
            # ``repl`` is removed from the fidelity space below (it is constant
            # 10), so it is fixed here in the configuration space instead
            self.configuration_space["repl"] = 10
        # Only set (to a set of str) for ``nb301``, see
        # ``_initialize_for_scenario``
        self._shortened_keys = None
        self._initialize_for_scenario()
        # Has to be called after ``_initialize_for_scenario``, in order to
        # transform fidelity space for some of the YAHPO scenarios
        self._adjust_fidelity_space(fidelities)
        # For ``iaml_`` / ``rbv2_``, integer fidelity values are mapped back to
        # the fractional ``trainsize`` values YAHPO expects by this factor
        self._fidelity_multiplier = 0.05 if self._is_iaml or self._is_rbv2 else 1

    def _initialize_for_scenario(self):
        """
        Scenario-specific transformations of configuration and fidelity space
        (see class docstring).
        """
        if self._is_iaml or self._is_rbv2:
            # For ``iaml_``, the fidelity ``trainsize`` has been evaluated at values
            # [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1]. We multiply these values by 20
            # in order to obtain integers: [1, 2, 4, 8, 12, 16, 20]
            # For ``rbv2_``, the fidelity ``trainsize`` has been evaluated at values
            # [0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]. We
            # multiply these values by 20 in order to obtain integers:
            # [1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20]
            if self._is_iaml:
                assert len(self.fidelity_space) == 1
            domain = self.fidelity_space.get("trainsize")
            assert domain is not None
            assert isinstance(domain, cs.Float)
            assert domain.upper == 1 and domain.lower <= 0.05
            # Replace the fractional fidelity by an integer surrogate
            self.fidelity_space["trainsize"] = cs.randint(1, 20)
            if self._is_rbv2:
                # For ``rbv2_``, a second fidelity is ``repl``, but it is constant
                # 10, so can be removed
                assert len(self.fidelity_space) == 2
                assert "repl" in self.fidelity_space
                del self.fidelity_space["repl"]
        elif self._is_nb301:
            # Shorten overly long attribute names by removing the
            # common prefix
            len_prefix = len(NB301_ATTRIBUTE_NAME_PREFIX)
            shortened_keys = []

            def map_key(k: str) -> str:
                # Strip the common prefix and remember which keys were shortened
                if k.startswith(NB301_ATTRIBUTE_NAME_PREFIX):
                    new_key = k[len_prefix:]
                    shortened_keys.append(new_key)
                    return new_key
                else:
                    return k

            self.configuration_space = {
                map_key(k): v for k, v in self.configuration_space.items()
            }
            # Used by ``_map_configuration`` to restore the original names
            self._shortened_keys = set(shortened_keys)

    def _adjust_fidelity_space(self, fidelities: Optional[List[int]]):
        """
        Determines ``self._fidelity_name`` and ``self._fidelity_values``. If
        ``fidelities`` is given, the (single) fidelity domain is replaced by
        ``cs.ordinal(fidelities)``.

        :param fidelities: Restriction of fidelity values (optional), must be
            strictly increasing positive ints within the fidelity domain
        """
        assert len(self.fidelity_space) == 1, "Only one fidelity is supported"
        self._fidelity_name, domain = next(iter(self.fidelity_space.items()))
        assert (
            domain.value_type == int
        ), f"value_type of fidelity attribute must be int, but is {domain.value_type}"
        if fidelities is None:
            # All integer values in [lower, upper]
            self._fidelity_values = np.arange(domain.lower, domain.upper + 1)
        else:
            assert is_increasing(fidelities) and is_positive_integer(
                fidelities
            ), f"fidelities = {fidelities} must be strictly increasing positive integers"
            assert (
                domain.lower <= fidelities[0] and fidelities[-1] <= domain.upper
            ), f"fidelities = {fidelities} must lie in [{domain.lower}, {domain.upper}]"
            self._fidelity_values = np.array(fidelities)
            self.fidelity_space[self._fidelity_name] = cs.ordinal(fidelities.copy())

    def _map_configuration(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """
        For ``nb301``, maps shortened attribute names back to the original
        (long) YAHPO names. For all other scenarios, ``config`` is returned
        unchanged.
        """
        if self._is_nb301:

            def map_key(k: str) -> str:
                if k in self._shortened_keys:
                    return NB301_ATTRIBUTE_NAME_PREFIX + k
                else:
                    return k

            return {map_key(k): v for k, v in config.items()}
        else:
            return config

    def _prepare_yahpo_configuration(
        self, configuration: Dict[str, Any], fidelity: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Some of the hyperparameters are only active for certain values of other
        hyperparameters. We filter out the inactive ones, and add the fidelity to the
        configuration in order to interface with YAHPO.

        Note: ``configuration`` is modified in place (the fidelity entry is
        added); callers pass a copy.
        """
        configuration.update(fidelity)
        active_hyperparameters = self.benchmark.config_space.get_active_hyperparameters(
            ConfigSpace.Configuration(
                self.benchmark.config_space,
                values=configuration,
                allow_inactive_with_values=True,
            )
        )
        return {k: v for k, v in configuration.items() if k in active_hyperparameters}

    def _parse_fidelity(self, fidelity: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validates ``fidelity`` and, for ``iaml_`` / ``rbv2_``, maps the integer
        ``trainsize`` value back to the fraction YAHPO expects.
        """
        if self._is_iaml or self._is_rbv2:
            k = "trainsize"
            fidelity_value = fidelity.get(k)
            assert (
                fidelity_value is not None
            ), f"fidelity = {fidelity} must contain key '{k}'"
            assert (
                fidelity_value in self.fidelity_values
            ), f"fidelity = {fidelity_value} not contained in {self.fidelity_values}"
            fidelity = {k: fidelity_value * self._fidelity_multiplier}
        return fidelity

    def _objective_function(
        self,
        configuration: Dict[str, Any],
        fidelity: Optional[dict] = None,
        seed: Optional[int] = None,
    ) -> Dict[str, Any]:
        # Work on a copy: the helpers below modify the configuration dict
        configuration = self._map_configuration(configuration.copy())

        if fidelity is not None:
            configuration = self._prepare_yahpo_configuration(
                configuration, self._parse_fidelity(fidelity)
            )
            # YAHPO returns a list with one result dict per configuration
            return self.benchmark.objective_function(configuration, seed=seed)[0]
        else:
            """
            copying the parent comment of the parent class:
            "not passing a fidelity is possible if either the blackbox does not have a fidelity space
            or if it has a single fidelity in its fidelity space. In the latter case, all fidelities are returned in form
            of a tensor with shape (num_fidelities, num_objectives)."
            This is used for efficiency (it is much faster to retrieve a full row in an array in term of read time).
            """
            # returns a tensor of shape (num_fidelities, num_objectives)
            num_fidelities = self.fidelity_values.size
            num_objectives = len(self.objectives_names)
            result = np.empty((num_fidelities, num_objectives))
            # One surrogate query per fidelity value, batched into a single call
            configs = [
                self._prepare_yahpo_configuration(
                    configuration,
                    {self._fidelity_name: fidelity * self._fidelity_multiplier},
                )
                for fidelity in self.fidelity_values
            ]
            result_dicts = self.benchmark.objective_function(configs, seed=seed)
            for i, result_dict in enumerate(result_dicts):
                result[i] = [
                    result_dict[objective] for objective in self.objectives_names
                ]
            return result

    def set_instance(self, instance):
        """
        Set an instance for the underlying YAHPO Benchmark.

        :param instance: Instance name
        :return: ``self`` (allows chaining)
        """
        # Set the instance in the benchmark
        self.benchmark.set_instance(instance)
        # Update the configspace with the fixed instance
        if self.benchmark.config.instance_names:
            instance_names = self.benchmark.config.instance_names
        else:
            instance_names = "instance-names"
        self.configuration_space[instance_names] = instance
        return self

    @property
    def instances(self) -> np.array:
        # Instances available for the underlying YAHPO benchmark
        return self.benchmark.instances

    @property
    def fidelity_values(self) -> np.array:
        # Set in ``_adjust_fidelity_space``
        return self._fidelity_values

    @property
    def time_attribute(self) -> str:
        """Name of the time column"""
        return self.benchmark.config.runtime_name
def cs_to_synetune(config_space):
    """
    Convert ConfigSpace.ConfigSpace to a synetune configspace.

    TODO cover all possible hyperparameters of ConfigSpace.ConfigSpace, right now we only convert the one we need.

    :param config_space: ``ConfigSpace`` configuration space to convert
    :return: Dict mapping hyperparameter names to Syne Tune domains
    """
    converted = dict()
    for a in config_space.get_hyperparameters():
        if isinstance(a, ConfigSpace.hyperparameters.CategoricalHyperparameter):
            # A categorical with a single choice becomes a constant value
            if len(a.choices) > 1:
                domain = cs.choice(a.choices)
            else:
                domain = a.choices[0]
        elif isinstance(a, ConfigSpace.hyperparameters.Constant):
            domain = a.value
        elif isinstance(a, ConfigSpace.hyperparameters.UniformIntegerHyperparameter):
            factory = cs.lograndint if a.log else cs.randint
            domain = factory(a.lower, a.upper)
        elif isinstance(a, ConfigSpace.hyperparameters.UniformFloatHyperparameter):
            factory = cs.loguniform if a.log else cs.uniform
            domain = factory(a.lower, a.upper)
        else:
            raise ValueError(
                f"Hyperparameter {a.name} has type {type(a)} which is not supported in this converter."
            )
        converted[a.name] = domain
    # FIXME: This should also handle dependencies between hyperparameters.
    return converted
def instantiate_yahpo(
    scenario: str,
    check: bool = False,
    fidelities: Optional[List[int]] = None,
):
    """
    Instantiates a dict of ``BlackBoxYAHPO``, one entry for each instance.

    :param scenario: Scenario name, must start with "yahpo-"
    :param check: If False, ``objective_function`` of the blackbox does not
        check whether the input configuration is valid. This is faster, but
        calls fail silently if configurations are invalid.
    :param fidelities: Restricts ``fidelity_values`` of each blackbox to these
        values, see :class:`BlackBoxYAHPO`
    :return: Dict from instance name to :class:`BlackBoxYAHPO`
    """
    prefix = "yahpo-"
    assert scenario.startswith(prefix)
    scenario = scenario[len(prefix) :]

    # Note: Yahpo expects to see tasks such as "rbv2_xgb" with specific folders under the data-path.
    # for this reason, we create all blackboxes under a subdir yahpo/ to avoid name clashes with other blackboxes
    # such as "fcnet" or "lcbench".
    local_config.init_config()
    local_config.set_data_path(str(repository_path / "yahpo"))

    # Select a Benchmark, active_session False because the ONNX session can not be serialized.
    bench = benchmark_set.BenchmarkSet(scenario, active_session=False)
    blackboxes = dict()
    for instance in bench.instances:
        surrogate = BenchmarkSet(
            scenario, active_session=False, instance=instance, check=check
        )
        blackboxes[instance] = BlackBoxYAHPO(surrogate, fidelities=fidelities)
    return blackboxes
def serialize_yahpo(scenario: str, target_path: Path, version: str = "1.0"):
    """
    Downloads the YAHPO data release and copies the files of ``scenario``
    into ``target_path``, then serializes the blackbox metadata there.

    :param scenario: Scenario name, must start with "yahpo-"
    :param target_path: Directory the scenario files are copied to
    :param version: Version tag of the ``yahpo_data`` release to download
    """
    prefix = "yahpo-"
    assert scenario.startswith(prefix)
    scenario = scenario[len(prefix) :]

    # download yahpo metadata and surrogate
    download(version=version, target_path=repository_path)

    # copy files to yahpo-scenario, replacing any previous copy
    if target_path.exists():
        shutil.rmtree(target_path)
    source_dir = repository_path / f"yahpo_data-{version}" / scenario
    shutil.copytree(str(source_dir), str(target_path))

    # For now we only serialize metadata because everything else can be obtained from YAHPO.
    serialize_metadata(
        path=target_path,
        metadata={
            metric_elapsed_time: "time",
            default_metric: "val_accuracy",
            resource_attr: ST_WORKER_ITER,  # TODO, ressource not present, we can use ST_WORKER_ITER
        },
    )
class YAHPORecipe(BlackboxRecipe):
    """
    Recipe which generates a YAHPO scenario as a Syne Tune blackbox on disk.

    :param name: Scenario name, must start with "yahpo-"
    """

    def __init__(self, name: str):
        assert name.startswith("yahpo-")
        self.scenario = name
        super().__init__(
            name=name,
            cite_reference="YAHPO Gym - An Efficient Multi-Objective Multi-Fidelity Benchmark for Hyperparameter Optimization. "
            "Pfisterer F., Schneider S., Moosbauer J., Binder M., Bischl B., 2022",
        )

    def _generate_on_disk(self):
        # Note: Yahpo expects to see tasks such as "rbv2_xgb" with specific folders under the data-path.
        # for this reason, we create all blackboxes under a subdir yahpo/ to avoid name clashes with other blackboxes
        serialize_yahpo(
            self.scenario, target_path=blackbox_local_path(name=self.scenario)
        )
yahpo_scenarios = list_scenarios()
if __name__ == "__main__":
    # Generates the "lcbench" YAHPO blackbox on disk, then plots one learning
    # curve as a sanity check
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    scenario = "lcbench"
    YAHPORecipe(f"yahpo-{scenario}").generate()

    # plot one learning-curve for sanity-check
    from syne_tune.blackbox_repository import load_blackbox

    bb_dict = load_blackbox(f"yahpo-{scenario}", skip_if_present=False)
    first_task = next(iter(bb_dict.keys()))
    b = bb_dict[first_task]
    # Sample one random configuration and evaluate it at every epoch
    configuration = {k: v.sample() for k, v in b.configuration_space.items()}
    errors = []
    runtime = []

    import matplotlib.pyplot as plt

    for i in range(1, 52):
        res = b.objective_function(configuration=configuration, fidelity={"epoch": i})
        errors.append(res["val_accuracy"])
        runtime.append(res["time"])
    # Plot metric against cumulative runtime over epochs
    plt.plot(np.cumsum(runtime), errors)
    plt.show()