diff --git a/.travis.yml b/.travis.yml
index 9c89febaf..64ca59411 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -432,6 +432,7 @@ matrix:
# - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 python/ray/util/sgd/...
- ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/...
- ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/...
+ - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/...
# Docs: Tests and examples.
- os: linux
diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt
index 861b8f878..6de01b0c9 100644
--- a/doc/requirements-doc.txt
+++ b/doc/requirements-doc.txt
@@ -29,4 +29,5 @@ tabulate
uvicorn
werkzeug
git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
+git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray
scikit-optimize
diff --git a/doc/source/conf.py b/doc/source/conf.py
index c3439949a..d710fe7d7 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -74,7 +74,6 @@ MOCK_MODULES = [
"torch.utils.data",
"torch.utils.data.distributed",
"wandb",
- "xgboost",
"zoopt",
]
import scipy.stats
diff --git a/doc/source/index.rst b/doc/source/index.rst
index f0dbc21bb..a2fd85b34 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -285,6 +285,7 @@ Papers
multiprocessing.rst
joblib.rst
iter.rst
+ xgboost-ray.rst
dask-on-ray.rst
mars-on-ray.rst
diff --git a/doc/source/tune/index.rst b/doc/source/tune/index.rst
index cbc974dab..86f312cf8 100644
--- a/doc/source/tune/index.rst
+++ b/doc/source/tune/index.rst
@@ -1,3 +1,5 @@
+.. _tune-main:
+
Tune: Scalable Hyperparameter Tuning
====================================
diff --git a/doc/source/xgboost-ray.rst b/doc/source/xgboost-ray.rst
new file mode 100644
index 000000000..9a9623cf9
--- /dev/null
+++ b/doc/source/xgboost-ray.rst
@@ -0,0 +1,161 @@
+.. _xgboost-ray:
+
+XGBoost on Ray
+==============
+
+This library adds a new backend for XGBoost utilizing Ray.
+
+Please note that this is an early version and both the API and
+the behavior can change without prior notice.
+
+Installation
+------------
+
+You can install XGBoost on Ray (``xgboost_ray``) like this:
+
+.. code-block:: bash
+
+ git clone https://github.com/ray-project/xgboost_ray.git
+ cd xgboost_ray
+ pip install -e .
+
+
+Usage
+-----
+
+After installation, you can import XGBoost on Ray via two ways:
+
+.. code-block:: bash
+
+ import xgboost_ray
+ # or
+ import ray.util.xgboost
+
+
+``xgboost_ray`` provides a drop-in replacement for XGBoost's ``train``
+function. To pass data, instead of using ``xgb.DMatrix`` you will
+have to use ``ray.util.xgboost.RayDMatrix``.
+
+Here is a simplified example:
+
+
+.. literalinclude:: /../../python/ray/util/xgboost/simple_example.py
+ :language: python
+ :start-after: __xgboost_begin__
+ :end-before: __xgboost_end__
+
+
+
+Data loading
+------------
+
+Data is passed to ``xgboost_ray`` via a ``RayDMatrix`` object.
+
+The ``RayDMatrix`` lazy loads data and stores it sharded in the
+Ray object store. The Ray XGBoost actors then access these
+shards to run their training on.
+
+A ``RayDMatrix`` support various data and file types, like
+Pandas DataFrames, Numpy Arrays, CSV files and Parquet files.
+
+Example loading multiple parquet files:
+
+.. code-block:: python
+
+ import glob
+ from ray.util.xgboost import RayDMatrix, RayFileType
+
+ # We can also pass a list of files
+ path = list(sorted(glob.glob("/data/nyc-taxi/*/*/*.parquet")))
+
+ # This argument will be passed to pd.read_parquet()
+ columns = [
+ "passenger_count",
+ "trip_distance", "pickup_longitude", "pickup_latitude",
+ "dropoff_longitude", "dropoff_latitude",
+ "fare_amount", "extra", "mta_tax", "tip_amount",
+ "tolls_amount", "total_amount"
+ ]
+
+ dtrain = RayDMatrix(
+ path,
+ label="passenger_count", # Will select this column as the label
+ columns=columns,
+ filetype=RayFileType.PARQUET)
+
+
+Hyperparameter Tuning
+---------------------
+``xgboost_ray`` integrates with Ray Tune (:ref:`tune-main`) to provide distributed hyperparameter tuning for your
+distributed XGBoost models. You can run multiple ``xgboost_ray`` training runs in parallel, each with a different
+hyperparameter configuration, with each individual training run parallelized.
+
+First, move your training code into a function. This function should take in a ``config`` argument which specifies the
+hyperparameters for the xgboost model.
+
+.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
+ :language: python
+ :start-after: __train_begin__
+ :end-before: __train_end__
+
+Then, you import tune and use tune's search primitives to define a hyperparameter search space.
+
+.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
+ :language: python
+ :start-after: __tune_begin__
+ :end-before: __tune_end__
+
+Finally, you call ``tune.run`` passing in the training function and the ``config``. Internally, tune will resolve the
+hyperparameter search space and invoke the training function multiple times, each with different hyperparameters.
+
+.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
+ :language: python
+ :start-after: __tune_run_begin__
+ :end-before: __tune_run_end__
+
+Make sure you set the ``extra_cpu`` field appropriately so tune is aware of the total number of resources each trial
+requires.
+
+
+Resources
+---------
+
+By default, ``xgboost_ray`` tries to determine the number of CPUs
+available and distributes them evenly across actors.
+
+In the case of very large clusters or clusters with many different
+machine sizes, it makes sense to limit the number of CPUs per actor
+by setting the ``cpus_per_actor`` argument. Consider always
+setting this explicitly.
+
+The number of XGBoost actors always has to be set manually with
+the ``num_actors`` argument.
+
+More examples
+-------------
+
+Fore complete end to end examples, please have a look at
+the `examples folder `__:
+
+* `Simple sklearn breastcancer dataset example `__ (requires `sklearn`)
+* `Simple sklearn breastcancer dataset example with Ray Tune `__ (requires `sklearn`)
+* `HIGGS classification example `__
+ * `[download dataset (2.6 GB)] `__
+* `HIGGS classification example with Parquet `__ (uses the same dataset)
+* `Test data classification `__ (uses a self-generated dataset)
+
+Package Reference
+-----------------
+
+
+Training/Validation
+~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: ray.util.xgboost.train
+
+.. autofunction:: ray.util.xgboost.predict
+
+RayDMatrix
+~~~~~~~~~~
+
+.. autoclass:: ray.util.xgboost.RayDMatrix
diff --git a/python/ray/util/xgboost/BUILD b/python/ray/util/xgboost/BUILD
new file mode 100644
index 000000000..41e48cf15
--- /dev/null
+++ b/python/ray/util/xgboost/BUILD
@@ -0,0 +1,26 @@
+# --------------------------------------------------------------------
+# Tests from the python/ray/util/sgd/tests directory.
+# Please keep these sorted alphabetically.
+# --------------------------------------------------------------------
+py_test(
+ name = "simple_example",
+ size = "small",
+ srcs = ["simple_example.py"],
+ deps = [":xgb_lib"],
+ tags = ["exclusive"],
+)
+
+py_test(
+ name = "simple_tune",
+ size="small",
+ srcs = ["simple_tune.py"],
+ deps = [":xgb_lib"],
+ tags = ["exlcusive"]
+)
+
+# This is a dummy test dependency that causes the above tests to be
+# re-run if any of these files changes.
+py_library(
+ name = "xgb_lib",
+ srcs = glob(["**/*.py"], exclude=["tests/*.py"]),
+)
\ No newline at end of file
diff --git a/python/ray/util/xgboost/__init__.py b/python/ray/util/xgboost/__init__.py
new file mode 100644
index 000000000..52c953799
--- /dev/null
+++ b/python/ray/util/xgboost/__init__.py
@@ -0,0 +1,16 @@
+import logging
+
+logger = logging.getLogger(__name__)
+
+train = None
+predict = None
+RayDMatrix = None
+
+try:
+ from xgboost_ray import train, predict, RayDMatrix, RayFileType
+except ImportError:
+ logger.info(
+ "xgboost_ray is not installed. Please run "
+ "`pip install git+https://github.com/ray-project/xgboost_ray`.")
+
+__all__ = ["train", "predict", "RayDMatrix", "RayFileType"]
diff --git a/python/ray/util/xgboost/simple_example.py b/python/ray/util/xgboost/simple_example.py
new file mode 100644
index 000000000..4c29bf6a6
--- /dev/null
+++ b/python/ray/util/xgboost/simple_example.py
@@ -0,0 +1,46 @@
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+
+from ray.util.xgboost import RayDMatrix, train
+
+
+# __xgboost_begin__
+def main():
+ # Load dataset
+ data, labels = datasets.load_breast_cancer(return_X_y=True)
+ # Split into train and test set
+ train_x, test_x, train_y, test_y = train_test_split(
+ data, labels, test_size=0.25)
+
+ train_set = RayDMatrix(train_x, train_y)
+ test_set = RayDMatrix(test_x, test_y)
+
+ # Set config
+ config = {
+ "tree_method": "approx",
+ "objective": "binary:logistic",
+ "eval_metric": ["logloss", "error"],
+ "max_depth": 3,
+ }
+
+ evals_result = {}
+
+ # Train the classifier
+ bst = train(
+ config,
+ train_set,
+ evals=[(test_set, "eval")],
+ evals_result=evals_result,
+ max_actor_restarts=1,
+ checkpoint_path="/tmp/checkpoint/",
+ verbose_eval=False)
+
+ bst.save_model("simple.xgb")
+ print("Final validation error: {:.4f}".format(
+ evals_result["eval"]["error"][-1]))
+
+
+# __xgboost_end__
+
+if __name__ == "__main__":
+ main()
diff --git a/python/ray/util/xgboost/simple_tune.py b/python/ray/util/xgboost/simple_tune.py
new file mode 100644
index 000000000..ddb8769bc
--- /dev/null
+++ b/python/ray/util/xgboost/simple_tune.py
@@ -0,0 +1,78 @@
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+
+from ray.util.xgboost import RayDMatrix, train
+
+# __train_begin__
+num_cpus_per_actor = 1
+num_actors = 1
+
+
+def train_model(config):
+ # Load dataset
+ data, labels = datasets.load_breast_cancer(return_X_y=True)
+ # Split into train and test set
+ train_x, test_x, train_y, test_y = train_test_split(
+ data, labels, test_size=0.25)
+
+ train_set = RayDMatrix(train_x, train_y)
+ test_set = RayDMatrix(test_x, test_y)
+
+ evals_result = {}
+ bst = train(
+ params=config,
+ dtrain=train_set,
+ evals=[(test_set, "eval")],
+ evals_result=evals_result,
+ verbose_eval=False,
+ num_actors=num_actors,
+ cpus_per_actor=num_cpus_per_actor)
+ bst.save_model("model.xgb")
+
+
+# __train_end__
+
+
+def main():
+ # __tune_begin__
+ from ray import tune
+
+ # Set config
+ config = {
+ "tree_method": "approx",
+ "objective": "binary:logistic",
+ "eval_metric": ["logloss", "error"],
+ "eta": tune.loguniform(1e-4, 1e-1),
+ "subsample": tune.uniform(0.5, 1.0),
+ "max_depth": tune.randint(1, 9)
+ }
+ # __tune_end__
+
+ # __tune_run_begin__
+ analysis = tune.run(
+ train_model,
+ config=config,
+ metric="eval-error",
+ mode="min",
+ num_samples=4,
+ resources_per_trial={
+ "cpu": 1,
+ "extra_cpu": num_actors * num_cpus_per_actor
+ })
+
+ # Load the best model checkpoint
+ import xgboost as xgb
+ import os
+
+ # Load in the best performing model.
+ best_bst = xgb.Booster()
+ best_bst.load_model(os.path.join(analysis.best_logdir, "model.xgb"))
+
+ accuracy = 1. - analysis.best_result["eval-error"]
+ print(f"Best model parameters: {analysis.best_config}")
+ print(f"Best model total accuracy: {accuracy:.4f}")
+ # __tune_run_end__
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/requirements_tune.txt b/python/requirements_tune.txt
index 794c0520f..d68d3b3d3 100644
--- a/python/requirements_tune.txt
+++ b/python/requirements_tune.txt
@@ -31,6 +31,7 @@ torchvision>=0.6.0
# transformers
git+git://github.com/huggingface/transformers.git@bdcc4b78a27775d1ec8f3fd297cb679c257289db#transformers
git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
+git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray
wandb
xgboost
zoopt>=0.4.1