From 7c009d22cf80aeb03fdc27b7f3d5497f3cf75e73 Mon Sep 17 00:00:00 2001
From: Richard Liaw <rliaw@berkeley.edu>
Date: Fri, 27 Nov 2020 11:36:56 -0800
Subject: [PATCH] [docs] Add xgboost_ray to docs (#12184)

Co-authored-by: Amog Kamsetty <amogkamsetty@yahoo.com>
---
 .travis.yml                               |   1 +
 doc/requirements-doc.txt                  |   1 +
 doc/source/conf.py                        |   1 -
 doc/source/index.rst                      |   1 +
 doc/source/tune/index.rst                 |   2 +
 doc/source/xgboost-ray.rst                | 161 ++++++++++++++++++++++
 python/ray/util/xgboost/BUILD             |  26 ++++
 python/ray/util/xgboost/__init__.py       |  16 +++
 python/ray/util/xgboost/simple_example.py |  46 +++++++
 python/ray/util/xgboost/simple_tune.py    |  78 +++++++++++
 python/requirements_tune.txt              |   1 +
 11 files changed, 333 insertions(+), 1 deletion(-)
 create mode 100644 doc/source/xgboost-ray.rst
 create mode 100644 python/ray/util/xgboost/BUILD
 create mode 100644 python/ray/util/xgboost/__init__.py
 create mode 100644 python/ray/util/xgboost/simple_example.py
 create mode 100644 python/ray/util/xgboost/simple_tune.py

diff --git a/.travis.yml b/.travis.yml
index 9c89febaf..64ca59411 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -432,6 +432,7 @@ matrix:
         # - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 python/ray/util/sgd/...
         - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/...
         - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/...
+        - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/...
 
     # Docs: Tests and examples.
     - os: linux
diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt
index 861b8f878..6de01b0c9 100644
--- a/doc/requirements-doc.txt
+++ b/doc/requirements-doc.txt
@@ -29,4 +29,5 @@ tabulate
 uvicorn
 werkzeug
 git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
+git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray
 scikit-optimize
diff --git a/doc/source/conf.py b/doc/source/conf.py
index c3439949a..d710fe7d7 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -74,7 +74,6 @@ MOCK_MODULES = [
     "torch.utils.data",
     "torch.utils.data.distributed",
     "wandb",
-    "xgboost",
     "zoopt",
 ]
 import scipy.stats
diff --git a/doc/source/index.rst b/doc/source/index.rst
index f0dbc21bb..a2fd85b34 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -285,6 +285,7 @@ Papers
    multiprocessing.rst
    joblib.rst
    iter.rst
+   xgboost-ray.rst
    dask-on-ray.rst
    mars-on-ray.rst
 
diff --git a/doc/source/tune/index.rst b/doc/source/tune/index.rst
index cbc974dab..86f312cf8 100644
--- a/doc/source/tune/index.rst
+++ b/doc/source/tune/index.rst
@@ -1,3 +1,5 @@
+.. _tune-main:
+
 Tune: Scalable Hyperparameter Tuning
 ====================================
 
diff --git a/doc/source/xgboost-ray.rst b/doc/source/xgboost-ray.rst
new file mode 100644
index 000000000..9a9623cf9
--- /dev/null
+++ b/doc/source/xgboost-ray.rst
@@ -0,0 +1,161 @@
+.. _xgboost-ray:
+
+XGBoost on Ray
+==============
+
+This library adds a new backend for XGBoost utilizing Ray.
+
+Please note that this is an early version and both the API and
+the behavior can change without prior notice.
+
+Installation
+------------
+
+You can install XGBoost on Ray (``xgboost_ray``) like this:
+
+.. code-block:: bash
+
+    git clone https://github.com/ray-project/xgboost_ray.git
+    cd xgboost_ray
+    pip install -e .
+
+
+Usage
+-----
+
+After installation, you can import XGBoost on Ray via two ways:
+
+.. code-block:: bash
+
+    import xgboost_ray
+    # or
+    import ray.util.xgboost
+
+
+``xgboost_ray`` provides a drop-in replacement for XGBoost's ``train``
+function. To pass data, instead of using ``xgb.DMatrix`` you will
+have to use ``ray.util.xgboost.RayDMatrix``.
+
+Here is a simplified example:
+
+
+.. literalinclude:: /../../python/ray/util/xgboost/simple_example.py
+   :language: python
+   :start-after:  __xgboost_begin__
+   :end-before:  __xgboost_end__
+
+
+
+Data loading
+------------
+
+Data is passed to ``xgboost_ray`` via a ``RayDMatrix`` object.
+
+The ``RayDMatrix`` lazy loads data and stores it sharded in the
+Ray object store. The Ray XGBoost actors then access these
+shards to run their training on.
+
+A ``RayDMatrix`` support various data and file types, like
+Pandas DataFrames, Numpy Arrays, CSV files and Parquet files.
+
+Example loading multiple parquet files:
+
+.. code-block:: python
+
+    import glob
+    from ray.util.xgboost import RayDMatrix, RayFileType
+
+    # We can also pass a list of files
+    path = list(sorted(glob.glob("/data/nyc-taxi/*/*/*.parquet")))
+
+    # This argument will be passed to pd.read_parquet()
+    columns = [
+        "passenger_count",
+        "trip_distance", "pickup_longitude", "pickup_latitude",
+        "dropoff_longitude", "dropoff_latitude",
+        "fare_amount", "extra", "mta_tax", "tip_amount",
+        "tolls_amount", "total_amount"
+    ]
+
+    dtrain = RayDMatrix(
+        path,
+        label="passenger_count",  # Will select this column as the label
+        columns=columns,
+        filetype=RayFileType.PARQUET)
+
+
+Hyperparameter Tuning
+---------------------
+``xgboost_ray`` integrates with Ray Tune (:ref:`tune-main`) to provide distributed hyperparameter tuning for your
+distributed XGBoost models. You can run multiple ``xgboost_ray`` training runs in parallel, each with a different
+hyperparameter configuration, with each individual training run parallelized.
+
+First, move your training code into a function. This function should take in a ``config`` argument which specifies the
+hyperparameters for the xgboost model.
+
+.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
+   :language: python
+   :start-after:  __train_begin__
+   :end-before:  __train_end__
+
+Then, you import tune and use tune's search primitives to define a hyperparameter search space.
+
+.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
+   :language: python
+   :start-after:  __tune_begin__
+   :end-before:  __tune_end__
+
+Finally, you call ``tune.run`` passing in the training function and the ``config``. Internally, tune will resolve the
+hyperparameter search space and invoke the training function multiple times, each with different hyperparameters.
+
+.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
+   :language: python
+   :start-after:  __tune_run_begin__
+   :end-before:  __tune_run_end__
+
+Make sure you set the ``extra_cpu`` field appropriately so tune is aware of the total number of resources each trial
+requires.
+
+
+Resources
+---------
+
+By default, ``xgboost_ray`` tries to determine the number of CPUs
+available and distributes them evenly across actors.
+
+In the case of very large clusters or clusters with many different
+machine sizes, it makes sense to limit the number of CPUs per actor
+by setting the ``cpus_per_actor`` argument. Consider always
+setting this explicitly.
+
+The number of XGBoost actors always has to be set manually with
+the ``num_actors`` argument.
+
+More examples
+-------------
+
+Fore complete end to end examples, please have a look at
+the `examples folder <https://github.com/ray-project/xgboost_ray/tree/master/examples/>`__:
+
+* `Simple sklearn breastcancer dataset example <https://github.com/ray-project/xgboost_ray/tree/master/examples/simple.py>`__ (requires `sklearn`)
+* `Simple sklearn breastcancer dataset example with Ray Tune <ttps://github.com/ray-project/xgboost_ray/tree/master/examples/simple_tune.py>`__ (requires `sklearn`)
+* `HIGGS classification example <https://github.com/ray-project/xgboost_ray/tree/master/examples/higgs.py>`__
+  * `[download dataset (2.6 GB)] <https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz>`__
+* `HIGGS classification example with Parquet <https://github.com/ray-project/xgboost_ray/tree/master/examples/higgs_parquet.py>`__ (uses the same dataset)
+* `Test data classification <https://github.com/ray-project/xgboost_ray/tree/master/examples/train_on_test_data.py>`__ (uses a self-generated dataset)
+
+Package Reference
+-----------------
+
+
+Training/Validation
+~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: ray.util.xgboost.train
+
+.. autofunction:: ray.util.xgboost.predict
+
+RayDMatrix
+~~~~~~~~~~
+
+.. autoclass:: ray.util.xgboost.RayDMatrix
diff --git a/python/ray/util/xgboost/BUILD b/python/ray/util/xgboost/BUILD
new file mode 100644
index 000000000..41e48cf15
--- /dev/null
+++ b/python/ray/util/xgboost/BUILD
@@ -0,0 +1,26 @@
+# --------------------------------------------------------------------
+# Tests from the python/ray/util/sgd/tests directory.
+# Please keep these sorted alphabetically.
+# --------------------------------------------------------------------
+py_test(
+    name = "simple_example",
+    size = "small",
+    srcs = ["simple_example.py"],
+    deps = [":xgb_lib"],
+    tags = ["exclusive"],
+)
+
+py_test(
+    name = "simple_tune",
+    size="small",
+    srcs = ["simple_tune.py"],
+    deps = [":xgb_lib"],
+    tags = ["exlcusive"]
+)
+
+# This is a dummy test dependency that causes the above tests to be
+# re-run if any of these files changes.
+py_library(
+    name = "xgb_lib",
+    srcs = glob(["**/*.py"], exclude=["tests/*.py"]),
+)
\ No newline at end of file
diff --git a/python/ray/util/xgboost/__init__.py b/python/ray/util/xgboost/__init__.py
new file mode 100644
index 000000000..52c953799
--- /dev/null
+++ b/python/ray/util/xgboost/__init__.py
@@ -0,0 +1,16 @@
+import logging
+
+logger = logging.getLogger(__name__)
+
+train = None
+predict = None
+RayDMatrix = None
+
+try:
+    from xgboost_ray import train, predict, RayDMatrix, RayFileType
+except ImportError:
+    logger.info(
+        "xgboost_ray is not installed. Please run "
+        "`pip install git+https://github.com/ray-project/xgboost_ray`.")
+
+__all__ = ["train", "predict", "RayDMatrix", "RayFileType"]
diff --git a/python/ray/util/xgboost/simple_example.py b/python/ray/util/xgboost/simple_example.py
new file mode 100644
index 000000000..4c29bf6a6
--- /dev/null
+++ b/python/ray/util/xgboost/simple_example.py
@@ -0,0 +1,46 @@
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+
+from ray.util.xgboost import RayDMatrix, train
+
+
+# __xgboost_begin__
+def main():
+    # Load dataset
+    data, labels = datasets.load_breast_cancer(return_X_y=True)
+    # Split into train and test set
+    train_x, test_x, train_y, test_y = train_test_split(
+        data, labels, test_size=0.25)
+
+    train_set = RayDMatrix(train_x, train_y)
+    test_set = RayDMatrix(test_x, test_y)
+
+    # Set config
+    config = {
+        "tree_method": "approx",
+        "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
+        "max_depth": 3,
+    }
+
+    evals_result = {}
+
+    # Train the classifier
+    bst = train(
+        config,
+        train_set,
+        evals=[(test_set, "eval")],
+        evals_result=evals_result,
+        max_actor_restarts=1,
+        checkpoint_path="/tmp/checkpoint/",
+        verbose_eval=False)
+
+    bst.save_model("simple.xgb")
+    print("Final validation error: {:.4f}".format(
+        evals_result["eval"]["error"][-1]))
+
+
+# __xgboost_end__
+
+if __name__ == "__main__":
+    main()
diff --git a/python/ray/util/xgboost/simple_tune.py b/python/ray/util/xgboost/simple_tune.py
new file mode 100644
index 000000000..ddb8769bc
--- /dev/null
+++ b/python/ray/util/xgboost/simple_tune.py
@@ -0,0 +1,78 @@
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+
+from ray.util.xgboost import RayDMatrix, train
+
+# __train_begin__
+num_cpus_per_actor = 1
+num_actors = 1
+
+
+def train_model(config):
+    # Load dataset
+    data, labels = datasets.load_breast_cancer(return_X_y=True)
+    # Split into train and test set
+    train_x, test_x, train_y, test_y = train_test_split(
+        data, labels, test_size=0.25)
+
+    train_set = RayDMatrix(train_x, train_y)
+    test_set = RayDMatrix(test_x, test_y)
+
+    evals_result = {}
+    bst = train(
+        params=config,
+        dtrain=train_set,
+        evals=[(test_set, "eval")],
+        evals_result=evals_result,
+        verbose_eval=False,
+        num_actors=num_actors,
+        cpus_per_actor=num_cpus_per_actor)
+    bst.save_model("model.xgb")
+
+
+# __train_end__
+
+
+def main():
+    # __tune_begin__
+    from ray import tune
+
+    # Set config
+    config = {
+        "tree_method": "approx",
+        "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
+        "eta": tune.loguniform(1e-4, 1e-1),
+        "subsample": tune.uniform(0.5, 1.0),
+        "max_depth": tune.randint(1, 9)
+    }
+    # __tune_end__
+
+    # __tune_run_begin__
+    analysis = tune.run(
+        train_model,
+        config=config,
+        metric="eval-error",
+        mode="min",
+        num_samples=4,
+        resources_per_trial={
+            "cpu": 1,
+            "extra_cpu": num_actors * num_cpus_per_actor
+        })
+
+    # Load the best model checkpoint
+    import xgboost as xgb
+    import os
+
+    # Load in the best performing model.
+    best_bst = xgb.Booster()
+    best_bst.load_model(os.path.join(analysis.best_logdir, "model.xgb"))
+
+    accuracy = 1. - analysis.best_result["eval-error"]
+    print(f"Best model parameters: {analysis.best_config}")
+    print(f"Best model total accuracy: {accuracy:.4f}")
+    # __tune_run_end__
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/requirements_tune.txt b/python/requirements_tune.txt
index 794c0520f..d68d3b3d3 100644
--- a/python/requirements_tune.txt
+++ b/python/requirements_tune.txt
@@ -31,6 +31,7 @@ torchvision>=0.6.0
 # transformers
 git+git://github.com/huggingface/transformers.git@bdcc4b78a27775d1ec8f3fd297cb679c257289db#transformers
 git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
+git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray
 wandb
 xgboost
 zoopt>=0.4.1