Source code for syne_tune.blackbox_repository.conversion_scripts.scripts.yahpo_import

# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

"""
Wrap Surrogates from 
YAHPO Gym - An Efficient Multi-Objective Multi-Fidelity Benchmark for Hyperparameter Optimization
Florian Pfisterer, Lennart Schneider, Julia Moosbauer, Martin Binder, Bernd Bischl
"""
from typing import Optional, List, Dict, Any
import logging
import shutil

from yahpo_gym import benchmark_set
import numpy as np
import zipfile
from pathlib import Path

from syne_tune.blackbox_repository.conversion_scripts.blackbox_recipe import (
    BlackboxRecipe,
)
from syne_tune.blackbox_repository.conversion_scripts.scripts import (
    default_metric,
    metric_elapsed_time,
    resource_attr,
)
from syne_tune.blackbox_repository.conversion_scripts.utils import (
    repository_path,
    blackbox_local_path,
)
from syne_tune.blackbox_repository.serialize import (
    serialize_metadata,
)
import syne_tune.config_space as cs
from syne_tune.blackbox_repository.blackbox import Blackbox
from syne_tune.constants import ST_WORKER_ITER
from syne_tune.util import is_increasing, is_positive_integer

import ConfigSpace
from yahpo_gym.benchmark_set import BenchmarkSet
from yahpo_gym.configuration import list_scenarios
from yahpo_gym import local_config

logger = logging.getLogger(__name__)


def download(target_path: Path, version: str):
    """
    Download and unpack a tagged release of the ``yahpo_data`` repository.

    If ``target_path / f"yahpo_data-{version}"`` already exists, nothing is
    downloaded. Otherwise the release archive is fetched from GitHub, stored
    as ``str(target_path) + ".zip"`` (i.e. next to ``target_path``), and
    extracted into ``target_path``.

    :param target_path: Directory into which the archive is extracted
    :param version: Release version without the leading "v", e.g. "1.0"
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded; import it explicitly so that ``urlretrieve`` is
    # available regardless of what other modules have been imported before.
    import urllib.request

    root = "https://github.com/slds-lmu/yahpo_data/archive/refs/tags/"

    target_file = target_path / f"yahpo_data-{version}"
    if not target_file.exists():
        logger.info(f"File {target_file} not found redownloading it.")
        zip_path = str(target_path) + ".zip"
        urllib.request.urlretrieve(root + f"v{version}.zip", zip_path)
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(target_path)
    else:
        logger.info(f"File {target_file} found, skipping download.")


def _check_whether_iaml(benchmark: BenchmarkSet) -> bool:
    """Return whether the ``config_id`` of ``benchmark`` starts with "iaml_"."""
    config_id = benchmark.config.config_id
    return config_id.startswith("iaml_")


def _check_whether_rbv2(benchmark: BenchmarkSet) -> bool:
    """Return whether the ``config_id`` of ``benchmark`` starts with "rbv2_"."""
    config_id = benchmark.config.config_id
    return config_id.startswith("rbv2_")


def _check_whether_nb301(benchmark: BenchmarkSet) -> bool:
    """Return whether the ``config_id`` of ``benchmark`` is exactly "nb301"."""
    config_id = benchmark.config.config_id
    return config_id == "nb301"


# Common prefix of overly long attribute names in the "nb301" scenario. It is
# stripped from configuration space keys and re-attached before calling YAHPO.
NB301_ATTRIBUTE_NAME_PREFIX = "NetworkSelectorDatasetInfo_COLON_darts_COLON_"


class BlackBoxYAHPO(Blackbox):
    """
    Wraps a 'YAHPO' ``BenchmarkSet`` so that it can be used as a
    :class:`Blackbox`.

    If ``fidelities`` is given, ``fidelity_values`` is restricted to these
    values. The sequence must consist of positive integers and be increasing.
    This works only if there is a single fidelity attribute with integer
    values (but note that for some specific YAHPO benchmarks, a fractional
    fidelity is transformed to an integer one).

    Even though YAHPO interpolates between fidelities, it can make sense to
    restrict them to the values which have really been acquired in the data.
    Note that this restricts multi-fidelity schedulers like
    :class:`~syne_tune.optimizer.schedulers.HyperbandScheduler`, in that all
    their rung levels have to be fidelity values.

    For example, for YAHPO ``iaml``, the fidelity ``trainsize`` has been
    acquired at [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1], this is transformed to
    [1, 2, 4, 8, 12, 16, 20]. By default, the fidelity is represented by
    ``cs.randint(1, 20)``, but if ``fidelities`` is passed, it uses
    ``cs.ordinal(fidelities)``.

    :param benchmark: YAHPO ``BenchmarkSet``
    :param fidelities: See above
    """

    def __init__(
        self,
        benchmark: BenchmarkSet,
        fidelities: Optional[List[int]] = None,
    ):
        self.benchmark = benchmark
        super().__init__(
            configuration_space=cs_to_synetune(
                self.benchmark.get_opt_space(drop_fidelity_params=True)
            ),
            fidelity_space=cs_to_synetune(self.benchmark.get_fidelity_space()),
            objectives_names=self.benchmark.config.y_names,
        )
        self.num_seeds = 1
        self._is_iaml = _check_whether_iaml(benchmark)
        self._is_rbv2 = _check_whether_rbv2(benchmark)
        self._is_nb301 = _check_whether_nb301(benchmark)
        if self._is_rbv2:
            # ``repl`` is fixed to a constant for ``rbv2_`` scenarios
            self.configuration_space["repl"] = 10
        self._shortened_keys = None
        self._initialize_for_scenario()
        # Must run after ``_initialize_for_scenario``, which transforms the
        # fidelity space for some of the YAHPO scenarios
        self._adjust_fidelity_space(fidelities)
        # Factor mapping integer fidelity values back to the YAHPO scale
        self._fidelity_multiplier = 0.05 if self._is_iaml or self._is_rbv2 else 1

    def _initialize_for_scenario(self):
        """Apply scenario-specific changes to fidelity and configuration space."""
        if self._is_iaml or self._is_rbv2:
            # For ``iaml_``, the fidelity ``trainsize`` has been evaluated at
            # values [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1]. We multiply these
            # values by 20 in order to obtain integers:
            # [1, 2, 4, 8, 12, 16, 20]
            # For ``rbv2_``, the fidelity ``trainsize`` has been evaluated at
            # values [0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1].
            # We multiply these values by 20 in order to obtain integers:
            # [1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20]
            if self._is_iaml:
                assert len(self.fidelity_space) == 1
            trainsize_domain = self.fidelity_space.get("trainsize")
            assert trainsize_domain is not None
            assert isinstance(trainsize_domain, cs.Float)
            assert trainsize_domain.upper == 1 and trainsize_domain.lower <= 0.05
            self.fidelity_space["trainsize"] = cs.randint(1, 20)
            if self._is_rbv2:
                # For ``rbv2_``, a second fidelity is ``repl``, but it is
                # constant 10, so can be removed
                assert len(self.fidelity_space) == 2
                assert "repl" in self.fidelity_space
                del self.fidelity_space["repl"]
        elif self._is_nb301:
            # Shorten overly long attribute names by removing the common
            # prefix, remembering which keys were shortened so that the
            # prefix can be restored in ``_map_configuration``
            prefix_length = len(NB301_ATTRIBUTE_NAME_PREFIX)
            shortened = []
            renamed_space = {}
            for name, hp_domain in self.configuration_space.items():
                if name.startswith(NB301_ATTRIBUTE_NAME_PREFIX):
                    name = name[prefix_length:]
                    shortened.append(name)
                renamed_space[name] = hp_domain
            self.configuration_space = renamed_space
            self._shortened_keys = set(shortened)

    def _adjust_fidelity_space(self, fidelities: Optional[List[int]]):
        """
        Determine the fidelity name and ``fidelity_values``. If ``fidelities``
        is given, it is validated and the fidelity domain is replaced by
        ``cs.ordinal(fidelities)``.
        """
        assert len(self.fidelity_space) == 1, "Only one fidelity is supported"
        name, domain = next(iter(self.fidelity_space.items()))
        self._fidelity_name = name
        assert (
            domain.value_type == int
        ), f"value_type of fidelity attribute must be int, but is {domain.value_type}"
        if fidelities is None:
            # All integers in the domain, both ends included
            self._fidelity_values = np.arange(domain.lower, domain.upper + 1)
            return
        assert is_increasing(fidelities) and is_positive_integer(
            fidelities
        ), f"fidelities = {fidelities} must be strictly increasing positive integers"
        assert (
            domain.lower <= fidelities[0] and fidelities[-1] <= domain.upper
        ), f"fidelities = {fidelities} must lie in [{domain.lower}, {domain.upper}]"
        self._fidelity_values = np.array(fidelities)
        self.fidelity_space[self._fidelity_name] = cs.ordinal(fidelities.copy())

    def _map_configuration(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """
        For the nb301 scenario, return a new dict in which the common prefix
        is re-attached to shortened keys. Otherwise, return ``config`` itself.
        """
        if not self._is_nb301:
            return config
        return {
            (NB301_ATTRIBUTE_NAME_PREFIX + key if key in self._shortened_keys else key): value
            for key, value in config.items()
        }

    def _prepare_yahpo_configuration(
        self, configuration: Dict[str, Any], fidelity: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Some of the hyperparameters are only active for certain values of
        other hyperparameters. We filter out the inactive ones, and add the
        fidelity to the configuration in order to interface with YAHPO.

        Note that ``configuration`` is updated in place with ``fidelity``.
        """
        configuration.update(fidelity)
        yahpo_space = self.benchmark.config_space
        active = yahpo_space.get_active_hyperparameters(
            ConfigSpace.Configuration(
                self.benchmark.config_space,
                values=configuration,
                allow_inactive_with_values=True,
            )
        )
        return {
            name: value for name, value in configuration.items() if name in active
        }

    def _parse_fidelity(self, fidelity: Dict[str, Any]) -> Dict[str, Any]:
        """
        For ``iaml_`` and ``rbv2_`` scenarios, validate the integer
        ``trainsize`` value and map it to the YAHPO scale. For all other
        scenarios, ``fidelity`` is returned unchanged.
        """
        if not (self._is_iaml or self._is_rbv2):
            return fidelity
        k = "trainsize"
        fidelity_value = fidelity.get(k)
        assert (
            fidelity_value is not None
        ), f"fidelity = {fidelity} must contain key '{k}'"
        assert (
            fidelity_value in self.fidelity_values
        ), f"fidelity = {fidelity_value} not contained in {self.fidelity_values}"
        return {k: fidelity_value * self._fidelity_multiplier}

    def _objective_function(
        self,
        configuration: Dict[str, Any],
        fidelity: Optional[dict] = None,
        seed: Optional[int] = None,
    ) -> Dict[str, Any]:
        """
        Evaluate the YAHPO benchmark at ``configuration``.

        If ``fidelity`` is given, the first entry of the benchmark's result
        for this single configuration is returned. Otherwise, an array of
        shape ``(num_fidelities, num_objectives)`` is returned, with one row
        for each entry of ``fidelity_values``.
        """
        # Work on a copy, since the configuration is modified below
        yahpo_config = self._map_configuration(configuration.copy())
        if fidelity is not None:
            yahpo_config = self._prepare_yahpo_configuration(
                yahpo_config, self._parse_fidelity(fidelity)
            )
            return self.benchmark.objective_function(yahpo_config, seed=seed)[0]

        # Copying the comment of the parent class:
        # "not passing a fidelity is possible if either the blackbox does not
        # have a fidelity space or if it has a single fidelity in its fidelity
        # space. In the latter case, all fidelities are returned in form of a
        # tensor with shape (num_fidelities, num_objectives)."
        # This is used for efficiency (it is much faster to retrieve a full
        # row in an array in term of read time).

        # returns a tensor of shape (num_fidelities, num_objectives)
        num_fidelities = self.fidelity_values.size
        num_objectives = len(self.objectives_names)
        result = np.empty((num_fidelities, num_objectives))
        configs = [
            self._prepare_yahpo_configuration(
                yahpo_config,
                {self._fidelity_name: level * self._fidelity_multiplier},
            )
            for level in self.fidelity_values
        ]
        result_dicts = self.benchmark.objective_function(configs, seed=seed)
        for row, metrics in enumerate(result_dicts):
            result[row] = [metrics[name] for name in self.objectives_names]
        return result

    def set_instance(self, instance):
        """
        Set an instance for the underlying YAHPO Benchmark.
        """
        # Set the instance in the benchmark
        self.benchmark.set_instance(instance)
        # Update the configspace with the fixed instance; fall back to a
        # default key if the benchmark config does not name one
        instance_key = self.benchmark.config.instance_names or "instance-names"
        self.configuration_space[instance_key] = instance
        return self

    @property
    def instances(self) -> np.array:
        """Instances of the underlying benchmark."""
        return self.benchmark.instances

    @property
    def fidelity_values(self) -> np.array:
        """Fidelity values as determined at construction."""
        return self._fidelity_values

    @property
    def time_attribute(self) -> str:
        """Name of the time column"""
        return self.benchmark.config.runtime_name


def cs_to_synetune(config_space):
    """
    Convert ConfigSpace.ConfigSpace to a synetune configspace.

    Categorical hyperparameters with a single choice and constants are mapped
    to their plain value. Uniform integer and float hyperparameters are mapped
    to the corresponding (log-)uniform Syne Tune domain.

    TODO cover all possible hyperparameters of ConfigSpace.ConfigSpace, right
    now we only convert the one we need.

    :param config_space: ConfigSpace configuration space to convert
    :return: Dictionary mapping hyperparameter names to domains or constants
    :raises ValueError: If a hyperparameter has an unsupported type
    """
    hp_types = ConfigSpace.hyperparameters
    converted = dict()
    for hp in config_space.get_hyperparameters():
        if isinstance(hp, hp_types.CategoricalHyperparameter):
            # A single choice is treated as a constant
            if len(hp.choices) > 1:
                domain = cs.choice(hp.choices)
            else:
                domain = hp.choices[0]
        elif isinstance(hp, hp_types.Constant):
            domain = hp.value
        elif isinstance(hp, hp_types.UniformIntegerHyperparameter):
            make_domain = cs.lograndint if hp.log else cs.randint
            domain = make_domain(hp.lower, hp.upper)
        elif isinstance(hp, hp_types.UniformFloatHyperparameter):
            make_domain = cs.loguniform if hp.log else cs.uniform
            domain = make_domain(hp.lower, hp.upper)
        else:
            raise ValueError(
                f"Hyperparameter {hp.name} has type {type(hp)} which is not supported in this converter."
            )
        converted[hp.name] = domain
    # FIXME: This should also handle dependencies between hyperparameters.
    return converted


def instantiate_yahpo(
    scenario: str,
    check: bool = False,
    fidelities: Optional[List[int]] = None,
):
    """
    Instantiates a dict of ``BlackBoxYAHPO``, one entry for each instance.

    :param scenario: Scenario name, must start with "yahpo-"
    :param check: If False, ``objective_function`` of the blackbox does not
        check whether the input configuration is valid. This is faster, but
        calls fail silently if configurations are invalid.
    :param fidelities: Passed to ``BlackBoxYAHPO`` for every instance
    :return: Dictionary mapping each instance to its ``BlackBoxYAHPO``
    """
    prefix = "yahpo-"
    assert scenario.startswith(prefix)
    yahpo_scenario = scenario[len(prefix) :]

    # Note: Yahpo expects to see tasks such as "rbv2_xgb" with specific
    # folders under the data-path. For this reason, we create all blackboxes
    # under a subdir yahpo/ to avoid name clashes with other blackboxes such
    # as "fcnet" or "lcbench".
    local_config.init_config()
    local_config.set_data_path(str(repository_path / "yahpo"))

    # Select a Benchmark, active_session False because the ONNX session can
    # not be serialized.
    bench = benchmark_set.BenchmarkSet(yahpo_scenario, active_session=False)

    blackboxes = dict()
    for instance in bench.instances:
        instance_benchmark = BenchmarkSet(
            yahpo_scenario, active_session=False, instance=instance, check=check
        )
        blackboxes[instance] = BlackBoxYAHPO(instance_benchmark, fidelities=fidelities)
    return blackboxes


def serialize_yahpo(scenario: str, target_path: Path, version: str = "1.0"):
    """
    Download the YAHPO data, copy the files of ``scenario`` to
    ``target_path`` (replacing it if it exists), and serialize metadata there.

    :param scenario: Scenario name, must start with "yahpo-"
    :param target_path: Directory to which the scenario files are copied
    :param version: Version of the YAHPO data release, defaults to "1.0"
    """
    prefix = "yahpo-"
    assert scenario.startswith(prefix)
    yahpo_scenario = scenario[len(prefix) :]

    # download yahpo metadata and surrogate
    download(version=version, target_path=repository_path)

    # copy files to yahpo-scenario
    if target_path.exists():
        shutil.rmtree(target_path)
    source_path = repository_path / f"yahpo_data-{version}" / yahpo_scenario
    shutil.copytree(str(source_path), str(target_path))

    # For now we only serialize metadata because everything else can be
    # obtained from YAHPO.
    metadata = {
        metric_elapsed_time: "time",
        default_metric: "val_accuracy",
        # TODO, ressource not present, we can use ST_WORKER_ITER
        resource_attr: ST_WORKER_ITER,
    }
    serialize_metadata(path=target_path, metadata=metadata)


class YAHPORecipe(BlackboxRecipe):
    """
    Recipe which generates the files of a YAHPO scenario on disk.

    :param name: Name of the blackbox, must start with "yahpo-"
    """

    def __init__(self, name: str):
        assert name.startswith("yahpo-")
        self.scenario = name
        citation = (
            "YAHPO Gym - An Efficient Multi-Objective Multi-Fidelity Benchmark for Hyperparameter Optimization. "
            "Pfisterer F., Schneider S., Moosbauer J., Binder M., Bischl B., 2022"
        )
        super().__init__(name=name, cite_reference=citation)

    def _generate_on_disk(self):
        # Note: Yahpo expects to see tasks such as "rbv2_xgb" with specific
        # folders under the data-path. For this reason, we create all
        # blackboxes under a subdir yahpo/ to avoid name clashes with other
        # blackboxes
        local_path = blackbox_local_path(name=self.scenario)
        serialize_yahpo(self.scenario, target_path=local_path)


# Names of all scenarios known to YAHPO Gym
yahpo_scenarios = list_scenarios()


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)

    scenario = "lcbench"
    blackbox_name = f"yahpo-{scenario}"
    YAHPORecipe(blackbox_name).generate()

    # plot one learning-curve for sanity-check
    from syne_tune.blackbox_repository import load_blackbox

    bb_dict = load_blackbox(blackbox_name, skip_if_present=False)
    # Use the blackbox of the first task
    blackbox = bb_dict[next(iter(bb_dict))]
    configuration = {
        name: domain.sample() for name, domain in blackbox.configuration_space.items()
    }
    errors = []
    runtime = []

    import matplotlib.pyplot as plt

    for epoch in range(1, 52):
        res = blackbox.objective_function(
            configuration=configuration, fidelity={"epoch": epoch}
        )
        errors.append(res["val_accuracy"])
        runtime.append(res["time"])

    # Validation accuracy against cumulative runtime
    plt.plot(np.cumsum(runtime), errors)
    plt.show()