Source code for syne_tune.blackbox_repository.conversion_scripts.scripts.lcbench.lcbench

# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import zipfile
import urllib.request

import pandas as pd
import numpy as np

from syne_tune.blackbox_repository.blackbox_tabular import serialize, BlackboxTabular
from syne_tune.blackbox_repository.conversion_scripts.blackbox_recipe import (
    BlackboxRecipe,
)
from syne_tune.blackbox_repository.conversion_scripts.scripts import (
    metric_elapsed_time,
    default_metric,
    resource_attr,
)
from syne_tune.config_space import randint, lograndint, uniform, loguniform
from syne_tune.util import catchtime
from syne_tune.blackbox_repository.conversion_scripts.scripts.lcbench.api import (
    Benchmark,
)
from syne_tune.blackbox_repository.conversion_scripts.utils import repository_path


BLACKBOX_NAME = "lcbench"

METRIC_ACCURACY = "val_accuracy"

METRIC_ELAPSED_TIME = "time"

RESOURCE_ATTR = "epoch"

MAX_RESOURCE_LEVEL = 50

SHA256_HASH = "5461a65e6b94a877fd3214b4ef50436e25127f73303bf3e77f5db59289ecc670"

CONFIGURATION_SPACE = {
    "num_layers": randint(1, 5),
    "max_units": lograndint(64, 1024),
    "batch_size": lograndint(16, 512),
    "learning_rate": loguniform(1e-4, 1e-1),
    "weight_decay": uniform(1e-5, 1e-1),
    "momentum": uniform(0.1, 0.99),
    "max_dropout": uniform(0.0, 1.0),
}
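

# Illustrative helper (a sketch, not part of the original script): each value in
# CONFIGURATION_SPACE is a Syne Tune ``Domain``, so a random configuration from the
# LCBench search space can be drawn by sampling every domain, e.g. to hand to a
# scheduler. Assumes ``Domain.sample()`` is available; this helper is never called
# by the conversion code below.
def _sample_random_configuration() -> dict:
    return {name: domain.sample() for name, domain in CONFIGURATION_SPACE.items()}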


def convert_task(bench, dataset_name):
    n_config = 2000
    configs = [
        bench.query(dataset_name=dataset_name, tag="config", config_id=i)
        for i in range(n_config)
    ]
    hyperparameters = pd.DataFrame(configs)

    # remove constant columns
    hyperparameters = hyperparameters.loc[
        :, (hyperparameters != hyperparameters.iloc[0]).any()
    ]

    objectives = [
        "Train/val_accuracy",
        "Train/val_balanced_accuracy",
        "Train/val_cross_entropy",
        "Train/test_result",
        "Train/test_balanced_accuracy",
        "Train/test_cross_entropy",
        "time",
    ]
    objectives_evaluations = np.zeros(
        (n_config, 1, MAX_RESOURCE_LEVEL, len(objectives))
    )
    fidelity_space = {RESOURCE_ATTR: randint(lower=1, upper=MAX_RESOURCE_LEVEL)}
    for j, tag in enumerate(objectives):
        for i in range(n_config):
            # Drop first evaluation (before training) as well as last.
            raw_objective_evaluations = bench.query(
                dataset_name=dataset_name, tag=tag, config_id=i
            )
            objectives_evaluations[i, 0, :, j] = raw_objective_evaluations[1:-1]
            if tag == "time":
                # Remove time for scoring the model before training it
                objectives_evaluations[i, 0, :, j] -= raw_objective_evaluations[0]

    return BlackboxTabular(
        hyperparameters=hyperparameters,
        configuration_space=CONFIGURATION_SPACE,
        fidelity_space=fidelity_space,
        objectives_evaluations=objectives_evaluations,
        fidelity_values=np.arange(1, MAX_RESOURCE_LEVEL + 1),
        objectives_names=[
            "val_accuracy",
            "val_balanced_accuracy",
            "val_cross_entropy",
            "test_accuracy",
            "test_balanced_accuracy",
            "test_cross_entropy",
            "time",
        ],
    )
class LCBenchRecipe(BlackboxRecipe):
    def __init__(self):
        super(LCBenchRecipe, self).__init__(
            name=BLACKBOX_NAME,
            hash=SHA256_HASH,
            cite_reference="Auto-PyTorch: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL. "
            "Lucas Zimmer, Marius Lindauer, Frank Hutter. 2020.",
        )

    def _generate_on_disk(self):
        data_file = repository_path / "data_2k_lw.zip"
        if not data_file.exists():
            src = "https://figshare.com/ndownloader/files/21188598"
            print(f"did not find {data_file}, downloading {src}")
            urllib.request.urlretrieve(src, data_file)
        with zipfile.ZipFile(data_file, "r") as zip_ref:
            zip_ref.extractall(repository_path)
        with catchtime("converting"):
            bench = Benchmark(str(repository_path / "data_2k_lw.json"), cache=False)
            bb_dict = {
                task: convert_task(bench, task) for task in bench.get_dataset_names()
            }
        with catchtime("saving to disk"):
            serialize(
                bb_dict=bb_dict,
                path=repository_path / self.name,
                metadata={
                    metric_elapsed_time: METRIC_ELAPSED_TIME,
                    default_metric: METRIC_ACCURACY,
                    resource_attr: RESOURCE_ATTR,
                },
            )
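
# Illustrative usage (a sketch, not part of the original recipe): once
# ``LCBenchRecipe().generate()`` has serialized the blackbox, it could be loaded
# and queried as below. Assumes ``load_blackbox`` from
# ``syne_tune.blackbox_repository``, that "Fashion-MNIST" is one of the LCBench
# dataset names, and that ``objective_function`` accepts a fidelity given as
# ``{RESOURCE_ATTR: value}``; this helper is never called by the recipe itself.
def _example_load_and_query():
    from syne_tune.blackbox_repository import load_blackbox

    bb_dict = load_blackbox(BLACKBOX_NAME)  # maps dataset name -> BlackboxTabular
    blackbox = bb_dict["Fashion-MNIST"]
    # Tabulated blackboxes can only be queried at configurations they contain,
    # so pick a row from the stored hyperparameters rather than sampling the space.
    config = blackbox.hyperparameters.sample(n=1).iloc[0].to_dict()
    result = blackbox.objective_function(
        config, fidelity={RESOURCE_ATTR: MAX_RESOURCE_LEVEL}
    )
    print(result)
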
if __name__ == "__main__":
    LCBenchRecipe().generate()