Tune XGBoost

Install dependencies

[ ]:

%pip install 'syne-tune[basic]'
%pip install xgboost

[ ]:

from syne_tune import Tuner, StoppingCriterion
from syne_tune.backend import PythonBackend
from syne_tune.config_space import randint, uniform, loguniform
from syne_tune.optimizer.baselines import BayesianOptimization
from syne_tune.experiments import load_experiment

Define the training function

[ ]:

def train(n_estimators: int, max_depth: int, gamma: float, reg_lambda: float) -> None:
    """Train an XGBoost classifier and report its validation accuracy to Syne Tune.

    This is the function being tuned; the hyperparameters are passed in as function arguments.
    The example trains an XGBoost model on the UCI ML hand-written digits dataset.

    Note that the training function must be totally self-contained as it needs to be serialized.
    Everything (including variables and dependencies) must be defined or imported inside the function scope.

    For more information on XGBoost's hyperparameters, see https://xgboost.readthedocs.io/en/stable/parameter.html
    For more information about the dataset, see https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html

    Args:
        n_estimators: Forwarded to ``xgboost.XGBClassifier`` as ``n_estimators``.
        max_depth: Forwarded to ``xgboost.XGBClassifier`` as ``max_depth``.
        gamma: Forwarded to ``xgboost.XGBClassifier`` as ``gamma``.
        reg_lambda: Forwarded to ``xgboost.XGBClassifier`` as ``reg_lambda``.

    Returns:
        None. The validation accuracy is sent back through ``Reporter`` under the key ``accuracy``.
    """

    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split
    from syne_tune import Reporter
    import xgboost
    import numpy as np

    X, y = load_digits(return_X_y=True)

    # Hold out 20% as a test set (not used during tuning, hence discarded), then
    # take 25% of the remainder -- i.e. 20% of the full data -- for validation.
    # The fixed random_state keeps the splits identical across trials.
    X_trainval, _, y_trainval, _ = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval, test_size=0.25, random_state=42
    )

    # Created before training starts, as in the original ordering
    report = Reporter()

    clf = xgboost.XGBClassifier(
        n_estimators=n_estimators,
        reg_lambda=reg_lambda,
        gamma=gamma,
        max_depth=max_depth,
    )
    clf.fit(X_train, y_train)

    # Fraction of validation samples predicted correctly, as a plain Python float
    y_pred = clf.predict(X_val)
    accuracy = float(np.mean(y_pred == y_val))

    # report metrics back to syne tune
    report(accuracy=accuracy)

Define the tuning parameters

[ ]:

# Search space: one entry per hyperparameter accepted by ``train``
config_space = {
    "max_depth": randint(1, 10),
    "gamma": uniform(1, 10),
    "reg_lambda": loguniform(1e-7, 1),
    "n_estimators": randint(5, 15),
}

# HPO algorithm: Bayesian optimization maximizing the reported "accuracy" metric
scheduler = BayesianOptimization(config_space, metric="accuracy", mode="max")

# Backend that evaluates ``train`` on configurations drawn from the search space
trial_backend = PythonBackend(tune_function=train, config_space=config_space)

# Tuning budget, expressed as a wall-clock limit
stop_criterion = StoppingCriterion(max_wallclock_time=30)

tuner = Tuner(
    trial_backend=trial_backend,
    scheduler=scheduler,
    stop_criterion=stop_criterion,
    n_workers=4,  # number of trials evaluated in parallel
)

Run the tuning and inspect the results

[ ]:

# Launch the tuning job
tuner.run()

# Load the results stored under this tuner's name
tuning_experiment = load_experiment(tuner.name)

# Show the best configuration found, then plot the experiment
best_config = tuning_experiment.best_config()
print(f"best result found: {best_config}")

tuning_experiment.plot()