From 7c009d22cf80aeb03fdc27b7f3d5497f3cf75e73 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Fri, 27 Nov 2020 11:36:56 -0800 Subject: [PATCH] [docs] Add xgboost_ray to docs (#12184) Co-authored-by: Amog Kamsetty --- .travis.yml | 1 + doc/requirements-doc.txt | 1 + doc/source/conf.py | 1 - doc/source/index.rst | 1 + doc/source/tune/index.rst | 2 + doc/source/xgboost-ray.rst | 161 ++++++++++++++++++++++ python/ray/util/xgboost/BUILD | 26 ++++ python/ray/util/xgboost/__init__.py | 16 +++ python/ray/util/xgboost/simple_example.py | 46 +++++++ python/ray/util/xgboost/simple_tune.py | 78 +++++++++++ python/requirements_tune.txt | 1 + 11 files changed, 333 insertions(+), 1 deletion(-) create mode 100644 doc/source/xgboost-ray.rst create mode 100644 python/ray/util/xgboost/BUILD create mode 100644 python/ray/util/xgboost/__init__.py create mode 100644 python/ray/util/xgboost/simple_example.py create mode 100644 python/ray/util/xgboost/simple_tune.py diff --git a/.travis.yml b/.travis.yml index 9c89febaf..64ca59411 100644 --- a/.travis.yml +++ b/.travis.yml @@ -432,6 +432,7 @@ matrix: # - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 python/ray/util/sgd/... - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/... - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/... + - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/... # Docs: Tests and examples. - os: linux diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 861b8f878..6de01b0c9 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -29,4 +29,5 @@ tabulate uvicorn werkzeug git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn +git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray scikit-optimize diff --git a/doc/source/conf.py b/doc/source/conf.py index c3439949a..d710fe7d7 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -74,7 +74,6 @@ MOCK_MODULES = [ "torch.utils.data", "torch.utils.data.distributed", "wandb", - "xgboost", "zoopt", ] import scipy.stats diff --git a/doc/source/index.rst b/doc/source/index.rst index f0dbc21bb..a2fd85b34 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -285,6 +285,7 @@ Papers multiprocessing.rst joblib.rst iter.rst + xgboost-ray.rst dask-on-ray.rst mars-on-ray.rst diff --git a/doc/source/tune/index.rst b/doc/source/tune/index.rst index cbc974dab..86f312cf8 100644 --- a/doc/source/tune/index.rst +++ b/doc/source/tune/index.rst @@ -1,3 +1,5 @@ +.. _tune-main: + Tune: Scalable Hyperparameter Tuning ==================================== diff --git a/doc/source/xgboost-ray.rst b/doc/source/xgboost-ray.rst new file mode 100644 index 000000000..9a9623cf9 --- /dev/null +++ b/doc/source/xgboost-ray.rst @@ -0,0 +1,161 @@ +.. _xgboost-ray: + +XGBoost on Ray +============== + +This library adds a new backend for XGBoost utilizing Ray. + +Please note that this is an early version and both the API and +the behavior can change without prior notice. + +Installation +------------ + +You can install XGBoost on Ray (``xgboost_ray``) like this: + +.. code-block:: bash + + git clone https://github.com/ray-project/xgboost_ray.git + cd xgboost_ray + pip install -e . + + +Usage +----- + +After installation, you can import XGBoost on Ray via two ways: + +.. code-block:: bash + + import xgboost_ray + # or + import ray.util.xgboost + + +``xgboost_ray`` provides a drop-in replacement for XGBoost's ``train`` +function. To pass data, instead of using ``xgb.DMatrix`` you will +have to use ``ray.util.xgboost.RayDMatrix``. + +Here is a simplified example: + + +.. literalinclude:: /../../python/ray/util/xgboost/simple_example.py + :language: python + :start-after: __xgboost_begin__ + :end-before: __xgboost_end__ + + + +Data loading +------------ + +Data is passed to ``xgboost_ray`` via a ``RayDMatrix`` object. + +The ``RayDMatrix`` lazy loads data and stores it sharded in the +Ray object store. The Ray XGBoost actors then access these +shards to run their training on. + +A ``RayDMatrix`` support various data and file types, like +Pandas DataFrames, Numpy Arrays, CSV files and Parquet files. + +Example loading multiple parquet files: + +.. code-block:: python + + import glob + from ray.util.xgboost import RayDMatrix, RayFileType + + # We can also pass a list of files + path = list(sorted(glob.glob("/data/nyc-taxi/*/*/*.parquet"))) + + # This argument will be passed to pd.read_parquet() + columns = [ + "passenger_count", + "trip_distance", "pickup_longitude", "pickup_latitude", + "dropoff_longitude", "dropoff_latitude", + "fare_amount", "extra", "mta_tax", "tip_amount", + "tolls_amount", "total_amount" + ] + + dtrain = RayDMatrix( + path, + label="passenger_count", # Will select this column as the label + columns=columns, + filetype=RayFileType.PARQUET) + + +Hyperparameter Tuning +--------------------- +``xgboost_ray`` integrates with Ray Tune (:ref:`tune-main`) to provide distributed hyperparameter tuning for your +distributed XGBoost models. You can run multiple ``xgboost_ray`` training runs in parallel, each with a different +hyperparameter configuration, with each individual training run parallelized. + +First, move your training code into a function. This function should take in a ``config`` argument which specifies the +hyperparameters for the xgboost model. + +.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py + :language: python + :start-after: __train_begin__ + :end-before: __train_end__ + +Then, you import tune and use tune's search primitives to define a hyperparameter search space. + +.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py + :language: python + :start-after: __tune_begin__ + :end-before: __tune_end__ + +Finally, you call ``tune.run`` passing in the training function and the ``config``. Internally, tune will resolve the +hyperparameter search space and invoke the training function multiple times, each with different hyperparameters. + +.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py + :language: python + :start-after: __tune_run_begin__ + :end-before: __tune_run_end__ + +Make sure you set the ``extra_cpu`` field appropriately so tune is aware of the total number of resources each trial +requires. + + +Resources +--------- + +By default, ``xgboost_ray`` tries to determine the number of CPUs +available and distributes them evenly across actors. + +In the case of very large clusters or clusters with many different +machine sizes, it makes sense to limit the number of CPUs per actor +by setting the ``cpus_per_actor`` argument. Consider always +setting this explicitly. + +The number of XGBoost actors always has to be set manually with +the ``num_actors`` argument. + +More examples +------------- + +Fore complete end to end examples, please have a look at +the `examples folder `__: + +* `Simple sklearn breastcancer dataset example `__ (requires `sklearn`) +* `Simple sklearn breastcancer dataset example with Ray Tune `__ (requires `sklearn`) +* `HIGGS classification example `__ + * `[download dataset (2.6 GB)] `__ +* `HIGGS classification example with Parquet `__ (uses the same dataset) +* `Test data classification `__ (uses a self-generated dataset) + +Package Reference +----------------- + + +Training/Validation +~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: ray.util.xgboost.train + +.. autofunction:: ray.util.xgboost.predict + +RayDMatrix +~~~~~~~~~~ + +.. autoclass:: ray.util.xgboost.RayDMatrix diff --git a/python/ray/util/xgboost/BUILD b/python/ray/util/xgboost/BUILD new file mode 100644 index 000000000..41e48cf15 --- /dev/null +++ b/python/ray/util/xgboost/BUILD @@ -0,0 +1,26 @@ +# -------------------------------------------------------------------- +# Tests from the python/ray/util/sgd/tests directory. +# Please keep these sorted alphabetically. +# -------------------------------------------------------------------- +py_test( + name = "simple_example", + size = "small", + srcs = ["simple_example.py"], + deps = [":xgb_lib"], + tags = ["exclusive"], +) + +py_test( + name = "simple_tune", + size="small", + srcs = ["simple_tune.py"], + deps = [":xgb_lib"], + tags = ["exlcusive"] +) + +# This is a dummy test dependency that causes the above tests to be +# re-run if any of these files changes. +py_library( + name = "xgb_lib", + srcs = glob(["**/*.py"], exclude=["tests/*.py"]), +) \ No newline at end of file diff --git a/python/ray/util/xgboost/__init__.py b/python/ray/util/xgboost/__init__.py new file mode 100644 index 000000000..52c953799 --- /dev/null +++ b/python/ray/util/xgboost/__init__.py @@ -0,0 +1,16 @@ +import logging + +logger = logging.getLogger(__name__) + +train = None +predict = None +RayDMatrix = None + +try: + from xgboost_ray import train, predict, RayDMatrix, RayFileType +except ImportError: + logger.info( + "xgboost_ray is not installed. Please run " + "`pip install git+https://github.com/ray-project/xgboost_ray`.") + +__all__ = ["train", "predict", "RayDMatrix", "RayFileType"] diff --git a/python/ray/util/xgboost/simple_example.py b/python/ray/util/xgboost/simple_example.py new file mode 100644 index 000000000..4c29bf6a6 --- /dev/null +++ b/python/ray/util/xgboost/simple_example.py @@ -0,0 +1,46 @@ +from sklearn import datasets +from sklearn.model_selection import train_test_split + +from ray.util.xgboost import RayDMatrix, train + + +# __xgboost_begin__ +def main(): + # Load dataset + data, labels = datasets.load_breast_cancer(return_X_y=True) + # Split into train and test set + train_x, test_x, train_y, test_y = train_test_split( + data, labels, test_size=0.25) + + train_set = RayDMatrix(train_x, train_y) + test_set = RayDMatrix(test_x, test_y) + + # Set config + config = { + "tree_method": "approx", + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + "max_depth": 3, + } + + evals_result = {} + + # Train the classifier + bst = train( + config, + train_set, + evals=[(test_set, "eval")], + evals_result=evals_result, + max_actor_restarts=1, + checkpoint_path="/tmp/checkpoint/", + verbose_eval=False) + + bst.save_model("simple.xgb") + print("Final validation error: {:.4f}".format( + evals_result["eval"]["error"][-1])) + + +# __xgboost_end__ + +if __name__ == "__main__": + main() diff --git a/python/ray/util/xgboost/simple_tune.py b/python/ray/util/xgboost/simple_tune.py new file mode 100644 index 000000000..ddb8769bc --- /dev/null +++ b/python/ray/util/xgboost/simple_tune.py @@ -0,0 +1,78 @@ +from sklearn import datasets +from sklearn.model_selection import train_test_split + +from ray.util.xgboost import RayDMatrix, train + +# __train_begin__ +num_cpus_per_actor = 1 +num_actors = 1 + + +def train_model(config): + # Load dataset + data, labels = datasets.load_breast_cancer(return_X_y=True) + # Split into train and test set + train_x, test_x, train_y, test_y = train_test_split( + data, labels, test_size=0.25) + + train_set = RayDMatrix(train_x, train_y) + test_set = RayDMatrix(test_x, test_y) + + evals_result = {} + bst = train( + params=config, + dtrain=train_set, + evals=[(test_set, "eval")], + evals_result=evals_result, + verbose_eval=False, + num_actors=num_actors, + cpus_per_actor=num_cpus_per_actor) + bst.save_model("model.xgb") + + +# __train_end__ + + +def main(): + # __tune_begin__ + from ray import tune + + # Set config + config = { + "tree_method": "approx", + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + "eta": tune.loguniform(1e-4, 1e-1), + "subsample": tune.uniform(0.5, 1.0), + "max_depth": tune.randint(1, 9) + } + # __tune_end__ + + # __tune_run_begin__ + analysis = tune.run( + train_model, + config=config, + metric="eval-error", + mode="min", + num_samples=4, + resources_per_trial={ + "cpu": 1, + "extra_cpu": num_actors * num_cpus_per_actor + }) + + # Load the best model checkpoint + import xgboost as xgb + import os + + # Load in the best performing model. + best_bst = xgb.Booster() + best_bst.load_model(os.path.join(analysis.best_logdir, "model.xgb")) + + accuracy = 1. - analysis.best_result["eval-error"] + print(f"Best model parameters: {analysis.best_config}") + print(f"Best model total accuracy: {accuracy:.4f}") + # __tune_run_end__ + + +if __name__ == "__main__": + main() diff --git a/python/requirements_tune.txt b/python/requirements_tune.txt index 794c0520f..d68d3b3d3 100644 --- a/python/requirements_tune.txt +++ b/python/requirements_tune.txt @@ -31,6 +31,7 @@ torchvision>=0.6.0 # transformers git+git://github.com/huggingface/transformers.git@bdcc4b78a27775d1ec8f3fd297cb679c257289db#transformers git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn +git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray wandb xgboost zoopt>=0.4.1