diff --git a/doc/source/deploy-on-kubernetes.rst b/doc/source/deploy-on-kubernetes.rst new file mode 100644 index 000000000..baaee56f3 --- /dev/null +++ b/doc/source/deploy-on-kubernetes.rst @@ -0,0 +1,143 @@ +Deploying on Kubernetes +======================= + +.. warning:: + + These instructions have not been tested extensively. If you have a suggestion + for how to improve them, please open a pull request or email + ray-dev@googlegroups.com. + +You can run Ray on top of Kubernetes. This document assumes that you have access +to a Kubernetes cluster and have ``kubectl`` installed locally. + +Start by cloning the Ray repository. + +.. code-block:: shell + + git clone https://github.com/ray-project/ray.git + +Work Interactively on the Cluster +--------------------------------- + +To work interactively, first start Ray on Kubernetes. + +.. code-block:: shell + + kubectl create -f ray/kubernetes/head.yaml + kubectl create -f ray/kubernetes/worker.yaml + +This will start one head pod and 3 worker pods. You can check that the pods are +running by running ``kubectl get pods``. + +You should see something like the following (you will have to wait a couple +minutes for the pods to enter the "Running" state). + +.. code-block:: shell + + $ kubectl get pods + NAME READY STATUS RESTARTS AGE + ray-head-controller-2kkfq 1/1 Running 0 47s + ray-worker-controller-d6jml 1/1 Running 0 45s + ray-worker-controller-m7jxs 1/1 Running 0 45s + ray-worker-controller-rg2sl 1/1 Running 0 45s + +To run tasks interactively on the cluster, connect to one of the pods, e.g., + +.. code-block:: shell + + kubectl exec -it ray-head-controller-2kkfq -- bash + +Start an IPython interpreter, e.g., ``ipython`` + +.. code-block:: python + + from collections import Counter + import socket + import time + import ray + + ray.init(redis_address="{}:6379".format(socket.gethostbyname("ray-head"))) + + @ray.remote + def f(x): + time.sleep(0.01) + return x + (ray.services.get_node_ip_address(), ) + + # Check that objects can be transferred from each node to each other node. + %time Counter(ray.get([f.remote(f.remote(())) for _ in range(1000)])) + +Submitting a Script to the Cluster +---------------------------------- + +To submit a self-contained Ray application to your Kubernetes cluster, do the +following. + +.. code-block:: shell + + kubectl create -f ray/kubernetes/submit.yaml + +One of the pods will download and run `this example script`_. + +.. _`this example script`: https://github.com/ray-project/ray/tree/master/kubernetes/example.py + +The script prints its output. To view the output, first find the pod name by +running ``kubectl get all``. You'll see output like the following. + +.. code-block:: shell + + $ kubectl get all + NAME READY STATUS RESTARTS AGE + pod/ray-head-controller-q6lck 1/1 Running 0 1m + pod/ray-worker-controller-kchfh 1/1 Running 0 1m + pod/ray-worker-controller-nmq5c 1/1 Running 0 1m + pod/ray-worker-controller-tfl2q 1/1 Running 0 1m + + NAME DESIRED CURRENT READY AGE + replicationcontroller/ray-head-controller 1 1 1 1m + replicationcontroller/ray-worker-controller 3 3 3 1m + + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + service/ray-head ClusterIP 10.64.5.153 6379/TCP,6380/TCP,6381/TCP,12345/TCP,12346/TCP 1m + +Find the name of the ``ray-head-controller`` pod and run the equivalent of + +.. code-block:: shell + + kubectl logs ray-head-controller-q6lck + +Cleaning Up +----------- + +To remove the services you have created, run the following. + +.. code-block:: shell + + kubectl delete service/ray-head \ + replicationcontroller/ray-head-controller \ + replicationcontroller/ray-worker-controller + + +Customization +------------- + +You will probably need to do some amount of customization. + +1. The example above uses the Docker image ``rayproject/examples``, which is + built using `these Dockerfiles`_. You will most likely need to use your own + Docker image. +2. You will need to modify the ``command`` and ``args`` fields to potentially + install and run the script of your choice. +3. You will need to customize the resource requests. + +TODO +---- + +The following are also important but haven't been documented yet. Contributions +are welcome! + +1. Request CPU/GPU/memory resources. +2. Increase shared memory. +3. How to make Kubernetes clean itself up once the script finishes. +4. Follow Kubernetes best practices. + +.. _`these Dockerfiles`: https://github.com/ray-project/ray/tree/master/docker diff --git a/doc/source/index.rst b/doc/source/index.rst index 27d4667ce..777d1d6b2 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -53,6 +53,7 @@ Ray comes with libraries that accelerate deep learning and reinforcement learnin :caption: Installation installation.rst + deploy-on-kubernetes.rst install-on-docker.rst installation-troubleshooting.rst diff --git a/kubernetes/example.py b/kubernetes/example.py new file mode 100644 index 000000000..b18ee6451 --- /dev/null +++ b/kubernetes/example.py @@ -0,0 +1,38 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import Counter +import socket +import sys +import time +import ray + +if __name__ == "__main__": + ray.init(redis_address="{}:6379".format(socket.gethostbyname("ray-head"))) + + # Wait for all 4 nodes to join the cluster. + while True: + num_nodes = len(ray.global_state.client_table()) + if num_nodes < 4: + print("{} nodes have joined so far. Waiting for more." + .format(num_nodes)) + sys.stdout.flush() + time.sleep(1) + else: + break + + @ray.remote + def f(x): + time.sleep(0.01) + return x + (ray.services.get_node_ip_address(), ) + + # Check that objects can be transferred from each node to each other node. + for i in range(100): + print("Iteration {}".format(i)) + sys.stdout.flush() + print(Counter(ray.get([f.remote(f.remote(())) for _ in range(10000)]))) + sys.stdout.flush() + + print("Success!") + sys.stdout.flush() diff --git a/kubernetes/head.yaml b/kubernetes/head.yaml new file mode 100644 index 000000000..c30f43c75 --- /dev/null +++ b/kubernetes/head.yaml @@ -0,0 +1,57 @@ +apiVersion: v1 +kind: Service +metadata: + name: ray-head +spec: + ports: + ports: + - name: redis-primary + port: 6379 + targetPort: 6379 + - name: redis-shard-0 + port: 6380 + targetPort: 6380 + - name: redis-shard-1 + port: 6381 + targetPort: 6381 + - name: object-manager + port: 12345 + targetPort: 12345 + - name: node-manager + port: 12346 + targetPort: 12346 + selector: + component: ray-head +--- +apiVersion: v1 +kind: ReplicationController +metadata: + name: ray-head-controller +spec: + replicas: 1 + selector: + component: ray-head + template: + metadata: + labels: + component: ray-head + spec: + containers: + - name: ray-head + image: rayproject/examples + command: [ "/bin/bash", "-c", "--" ] + args: ["ray start --head --redis-port=6379 --redis-shard-ports=6380,6381 --object-manager-port=12345 --node-manager-port=12346 --node-ip-address=$MY_POD_IP && while true; do sleep 30; done;"] + ports: + - containerPort: 6379 + - containerPort: 6380 + - containerPort: 6381 + - containerPort: 12345 + - containerPort: 12346 + env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + resources: + requests: + cpu: 10 diff --git a/kubernetes/submit.yaml b/kubernetes/submit.yaml new file mode 100644 index 000000000..52e8c1b1c --- /dev/null +++ b/kubernetes/submit.yaml @@ -0,0 +1,90 @@ +apiVersion: v1 +kind: Service +metadata: + name: ray-head +spec: + ports: + ports: + - name: redis-primary + port: 6379 + targetPort: 6379 + - name: redis-shard-0 + port: 6380 + targetPort: 6380 + - name: redis-shard-1 + port: 6381 + targetPort: 6381 + - name: object-manager + port: 12345 + targetPort: 12345 + - name: node-manager + port: 12346 + targetPort: 12346 + selector: + component: ray-head +--- +apiVersion: v1 +kind: ReplicationController +metadata: + name: ray-head-controller +spec: + replicas: 1 + selector: + component: ray-head + template: + metadata: + labels: + component: ray-head + spec: + containers: + - name: ray-head + image: rayproject/examples + command: [ "/bin/bash", "-c", "--" ] + args: + - "wget https://raw.githubusercontent.com/ray-project/ray/master/kubernetes/example.py && + ray start --head --redis-port=6379 --redis-shard-ports=6380,6381 --object-manager-port=12345 --node-manager-port=12346 --node-ip-address=$MY_POD_IP && + python example.py" + ports: + - containerPort: 6379 + - containerPort: 6380 + - containerPort: 6381 + - containerPort: 12345 + - containerPort: 12346 + env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + resources: + requests: + cpu: 10 +--- +apiVersion: v1 +kind: ReplicationController +metadata: + name: ray-worker-controller +spec: + replicas: 3 + selector: + component: ray-worker + template: + metadata: + labels: + component: ray-worker + spec: + containers: + - name: ray-worker + image: rayproject/examples + command: ["/bin/bash", "-c", "--"] + args: ["ray start --node-ip-address=$MY_POD_IP --redis-address=$(python -c 'import socket;import sys; sys.stdout.write(socket.gethostbyname(\"ray-head\"));sys.stdout.flush()'):6379 --object-manager-port=12345 --node-manager-port=12346 && while true; do sleep 30; done;"] + ports: + - containerPort: 12345 + - containerPort: 12346 + env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + resources: + requests: + cpu: 10 diff --git a/kubernetes/worker.yaml b/kubernetes/worker.yaml new file mode 100644 index 000000000..c4d9204eb --- /dev/null +++ b/kubernetes/worker.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: ReplicationController +metadata: + name: ray-worker-controller +spec: + replicas: 3 + selector: + component: ray-worker + template: + metadata: + labels: + component: ray-worker + spec: + containers: + - name: ray-worker + image: rayproject/examples + command: ["/bin/bash", "-c", "--"] + args: ["ray start --node-ip-address=$MY_POD_IP --redis-address=$(python -c 'import socket;import sys; sys.stdout.write(socket.gethostbyname(\"ray-head\"));sys.stdout.flush()'):6379 --object-manager-port=12345 --node-manager-port=12346 && while true; do sleep 30; done;"] + ports: + - containerPort: 12345 + - containerPort: 12346 + env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + resources: + requests: + cpu: 10