diff --git a/doc/source/autoscaling.rst b/doc/source/autoscaling.rst index dbe6aba9b..c3f3f92c5 100644 --- a/doc/source/autoscaling.rst +++ b/doc/source/autoscaling.rst @@ -12,7 +12,7 @@ as described in `the boto docs `__ cluster config file will create a small cluster with a m4.large head node (on-demand), and two m4.large `spot workers `__, configured to autoscale up to four m4.large workers. Try it out by running these commands from your personal computer. Once the cluster is started, you can then -SSH into the head node to run Ray programs with ``ray.init(redis_address=ray.services.get_node_ip_address() + ":6379")``. +SSH into the head node, ``source activate tensorflow_p36``, and then run Ray programs with ``ray.init(redis_address=ray.services.get_node_ip_address() + ":6379")``. .. code-block:: bash @@ -31,7 +31,7 @@ To run connect to applications running on the cluster (e.g. Jupyter notebook) us .. code-block:: bash - $ ssh -L 8899:localhost:8899 -i @ jupyter notebook --port=8899 + $ ssh -L 8899:localhost:8899 -i @ 'source ~/anaconda3/bin/activate tensorflow_p36 && jupyter notebook --port=8899' Updating your cluster --------------------- @@ -57,7 +57,7 @@ The default idle timeout is 5 minutes. This is to prevent excessive node churn w Monitoring cluster status ------------------------- -You can monitor cluster usage and auto-scaling status by tailing the autoscaling logs in ``/tmp/raylogs/monitor-*.log``. +You can monitor cluster usage and auto-scaling status by tailing the autoscaling logs in ``/tmp/raylogs/monitor-*``. The Ray autoscaler also reports per-node status in the form of instance tags. In the AWS console, you can click on a Node, go the the "Tags" pane, and add the ``ray:NodeStatus`` tag as a column. This lets you see per-node statuses at a glance: diff --git a/doc/source/tune.rst b/doc/source/tune.rst index f93463b0b..04ad02d1f 100644 --- a/doc/source/tune.rst +++ b/doc/source/tune.rst @@ -73,7 +73,7 @@ This script runs a small grid search over the ``my_func`` function using Ray Tun - my_func_4_alpha=0.4,beta=2: RUNNING [pid=6800], 209 s, 41204 ts, 70.1 acc - my_func_5_alpha=0.6,beta=2: TERMINATED [pid=6809], 10 s, 2164 ts, 100 acc -In order to report incremental progress, ``my_func`` periodically calls the ``reporter`` function passed in by Ray Tune to return the current timestep and other metrics as defined in `ray.tune.result.TrainingResult `__. Incremental results will be saved to local disk and optionally uploaded to the specified ``upload_dir`` (e.g. S3 path). +In order to report incremental progress, ``my_func`` periodically calls the ``reporter`` function passed in by Ray Tune to return the current timestep and other metrics as defined in `ray.tune.result.TrainingResult `__. Incremental results will be synced to local disk on the head node of the cluster and optionally uploaded to the specified ``upload_dir`` (e.g. S3 path). Visualizing Results ------------------- @@ -199,6 +199,29 @@ Trial Checkpointing To enable checkpoint / resume, you must subclass ``Trainable`` and implement its ``_train``, ``_save``, and ``_restore`` abstract methods `(example) `__: Implementing this interface is required to support resource multiplexing in schedulers such as HyperBand and PBT. +For TensorFlow model training, this would look something like this `(full tensorflow example) `__: + +.. code-block:: python + + class MyClass(Trainable): + def _setup(self): + self.saver = tf.train.Saver() + self.sess = ... + self.iteration = 0 + + def _train(self): + self.sess.run(...) + self.iteration += 1 + + def _save(self, checkpoint_dir): + return self.saver.save( + self.sess, checkpoint_dir + "/save", + global_step=self.iteration) + + def _restore(self, path): + return self.saver.restore(self.sess, path) + + Additionally, checkpointing can be used to provide fault-tolerance for experiments. This can be enabled by setting ``checkpoint_freq: N`` and ``max_failures: M`` to checkpoint trials every *N* iterations and recover from up to *M* crashes per trial, e.g.: .. code-block:: python diff --git a/python/ray/autoscaler/aws/development-example.yaml b/python/ray/autoscaler/aws/development-example.yaml index c0fec89e3..678f8365f 100644 --- a/python/ray/autoscaler/aws/development-example.yaml +++ b/python/ray/autoscaler/aws/development-example.yaml @@ -13,6 +13,7 @@ max_workers: 2 # usage. For example, if a cluster of 10 nodes is 100% busy and # target_utilization is 0.8, it would resize the cluster to 13. This fraction # can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. target_utilization_fraction: 0.8 # If a node is idle for this many minutes, it will be removed. @@ -80,6 +81,10 @@ file_mounts: { # List of shell commands to run to set up nodes. setup_commands: + # Consider uncommenting these if you run into dpkg locking issues + # - sudo pkill -9 apt-get || true + # - sudo pkill -9 dpkg || true + # - sudo dpkg --configure -a # Install basics. - sudo apt-get update - sudo apt-get install -y cmake pkg-config build-essential autoconf curl libtool unzip python diff --git a/python/ray/autoscaler/aws/example.yaml b/python/ray/autoscaler/aws/example.yaml index 25fe8052c..ae098e616 100644 --- a/python/ray/autoscaler/aws/example.yaml +++ b/python/ray/autoscaler/aws/example.yaml @@ -20,6 +20,7 @@ docker: # usage. For example, if a cluster of 10 nodes is 100% busy and # target_utilization is 0.8, it would resize the cluster to 13. This fraction # can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. target_utilization_fraction: 0.8 # If a node is idle for this many minutes, it will be removed. @@ -84,7 +85,11 @@ setup_commands: # Note: if you're developing Ray, you probably want to create an AMI that # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). - - most_recent() { echo pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/$(aws s3 ls s3://ray-wheels --recursive | grep $1 | sort -r | head -n 1 | awk '{print $4}'); } && $( most_recent "cp36-cp36m-manylinux1" ) || $( most_recent "cp35-cp35m-manylinux1" ) + - source activate tensorflow_p36 && most_recent() { echo pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/$(aws s3 ls s3://ray-wheels --recursive | grep $1 | sort -r | head -n 1 | awk '{print $4}'); } && $( most_recent "cp36-cp36m-manylinux1" ) || $( most_recent "cp35-cp35m-manylinux1" ) + # Consider uncommenting these if you also want to run apt-get commands during setup + # - sudo pkill -9 apt-get || true + # - sudo pkill -9 dpkg || true + # - sudo dpkg --configure -a # Custom commands that will be run on the head node after common setup. head_setup_commands: diff --git a/python/ray/tune/examples/tune_mnist_ray_hyperband.py b/python/ray/tune/examples/tune_mnist_ray_hyperband.py new file mode 100755 index 000000000..adbff1be5 --- /dev/null +++ b/python/ray/tune/examples/tune_mnist_ray_hyperband.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python +# +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""A deep MNIST classifier using convolutional layers. +See extensive documentation at +https://www.tensorflow.org/get_started/mnist/pros +""" +# Disable linter warnings to maintain consistency with tutorial. +# pylint: disable=invalid-name +# pylint: disable=g-bad-import-order + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import time + +import ray +from ray.tune import grid_search, run_experiments, register_trainable, \ + Trainable, TrainingResult +from ray.tune.hyperband import HyperBandScheduler +from tensorflow.examples.tutorials.mnist import input_data + +import tensorflow as tf +import numpy as np + +activation_fn = None # e.g. tf.nn.relu + + +def setupCNN(x): + """setupCNN builds the graph for a deep net for classifying digits. + Args: + x: an input tensor with the dimensions (N_examples, 784), where 784 is + the number of pixels in a standard MNIST image. + Returns: + A tuple (y, keep_prob). y is a tensor of shape (N_examples, 10), with + values equal to the logits of classifying the digit into one of 10 + classes (the digits 0-9). keep_prob is a scalar placeholder for the + probability of dropout. + """ + # Reshape to use within a convolutional neural net. + # Last dimension is for "features" - there is only one here, since images + # are grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc. + with tf.name_scope('reshape'): + x_image = tf.reshape(x, [-1, 28, 28, 1]) + + # First convolutional layer - maps one grayscale image to 32 feature maps. + with tf.name_scope('conv1'): + W_conv1 = weight_variable([5, 5, 1, 32]) + b_conv1 = bias_variable([32]) + h_conv1 = activation_fn(conv2d(x_image, W_conv1) + b_conv1) + + # Pooling layer - downsamples by 2X. + with tf.name_scope('pool1'): + h_pool1 = max_pool_2x2(h_conv1) + + # Second convolutional layer -- maps 32 feature maps to 64. + with tf.name_scope('conv2'): + W_conv2 = weight_variable([5, 5, 32, 64]) + b_conv2 = bias_variable([64]) + h_conv2 = activation_fn(conv2d(h_pool1, W_conv2) + b_conv2) + + # Second pooling layer. + with tf.name_scope('pool2'): + h_pool2 = max_pool_2x2(h_conv2) + + # Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image + # is down to 7x7x64 feature maps -- maps this to 1024 features. + with tf.name_scope('fc1'): + W_fc1 = weight_variable([7 * 7 * 64, 1024]) + b_fc1 = bias_variable([1024]) + + h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64]) + h_fc1 = activation_fn(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) + + # Dropout - controls the complexity of the model, prevents co-adaptation of + # features. + with tf.name_scope('dropout'): + keep_prob = tf.placeholder(tf.float32) + h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) + + # Map the 1024 features to 10 classes, one for each digit + with tf.name_scope('fc2'): + W_fc2 = weight_variable([1024, 10]) + b_fc2 = bias_variable([10]) + + y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2 + return y_conv, keep_prob + + +def conv2d(x, W): + """conv2d returns a 2d convolution layer with full stride.""" + return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') + + +def max_pool_2x2(x): + """max_pool_2x2 downsamples a feature map by 2X.""" + return tf.nn.max_pool( + x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') + + +def weight_variable(shape): + """weight_variable generates a weight variable of a given shape.""" + initial = tf.truncated_normal(shape, stddev=0.1) + return tf.Variable(initial) + + +def bias_variable(shape): + """bias_variable generates a bias variable of a given shape.""" + initial = tf.constant(0.1, shape=shape) + return tf.Variable(initial) + + +class TrainMNIST(Trainable): + """Example MNIST trainable.""" + + def _setup(self): + global activation_fn + + self.timestep = 0 + + # Import data + for _ in range(10): + try: + self.mnist = input_data.read_data_sets( + "/tmp/mnist_ray_demo", one_hot=True) + break + except Exception as e: + print("Error loading data, retrying", e) + time.sleep(5) + + assert self.mnist + + self.x = tf.placeholder(tf.float32, [None, 784]) + self.y_ = tf.placeholder(tf.float32, [None, 10]) + + activation_fn = getattr(tf.nn, self.config['activation']) + + # Build the graph for the deep net + y_conv, self.keep_prob = setupCNN(self.x) + + with tf.name_scope('loss'): + cross_entropy = tf.nn.softmax_cross_entropy_with_logits( + labels=self.y_, logits=y_conv) + cross_entropy = tf.reduce_mean(cross_entropy) + + with tf.name_scope('adam_optimizer'): + train_step = tf.train.AdamOptimizer( + self.config['learning_rate']).minimize(cross_entropy) + + self.train_step = train_step + + with tf.name_scope('accuracy'): + correct_prediction = tf.equal( + tf.argmax(y_conv, 1), tf.argmax(self.y_, 1)) + correct_prediction = tf.cast(correct_prediction, tf.float32) + self.accuracy = tf.reduce_mean(correct_prediction) + + self.sess = tf.Session() + self.sess.run(tf.global_variables_initializer()) + self.iterations = 0 + self.saver = tf.train.Saver() + + def _train(self): + for i in range(10): + batch = self.mnist.train.next_batch(50) + self.sess.run( + self.train_step, + feed_dict={ + self.x: batch[0], self.y_: batch[1], self.keep_prob: 0.5 + }) + + batch = self.mnist.train.next_batch(50) + train_accuracy = self.sess.run( + self.accuracy, + feed_dict={ + self.x: batch[0], self.y_: batch[1], self.keep_prob: 1.0 + }) + + self.iterations += 1 + return TrainingResult( + timesteps_this_iter=10, mean_accuracy=train_accuracy) + + def _save(self, checkpoint_dir): + return self.saver.save( + self.sess, checkpoint_dir + "/save", global_step=self.iterations) + + def _restore(self, path): + return self.saver.restore(self.sess, path) + + +# !!! Example of using the ray.tune Python API !!! +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--smoke-test', action='store_true', help='Finish quickly for testing') + args, _ = parser.parse_known_args() + + register_trainable("my_class", TrainMNIST) + mnist_spec = { + 'run': 'my_class', + 'stop': { + 'mean_accuracy': 0.99, + 'time_total_s': 600, + }, + 'config': { + 'learning_rate': lambda spec: 10 ** np.random.uniform(-5, -3), + 'activation': grid_search(['relu', 'elu', 'tanh']), + }, + "repeat": 10, + } + + if args.smoke_test: + mnist_spec['stop']['training_iteration'] = 2 + mnist_spec['repeat'] = 2 + + ray.init() + hyperband = HyperBandScheduler( + time_attr="timesteps_total", reward_attr="mean_accuracy", + max_t=100) + + run_experiments( + {'mnist_hyperband_test': mnist_spec}, scheduler=hyperband) diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh index 295969a5d..ec0fe2103 100755 --- a/test/jenkins_tests/run_multi_node_tests.sh +++ b/test/jenkins_tests/run_multi_node_tests.sh @@ -202,6 +202,10 @@ docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/tune/examples/hyperband_example.py \ --smoke-test +docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ + python /ray/python/ray/tune/examples/tune_mnist_ray_hyperband.py \ + --smoke-test + docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/examples/multiagent_mountaincar.py