diff --git a/ci/jenkins_tests/run_multi_node_tests.sh b/ci/jenkins_tests/run_multi_node_tests.sh index a1f622f32..645b376cc 100755 --- a/ci/jenkins_tests/run_multi_node_tests.sh +++ b/ci/jenkins_tests/run_multi_node_tests.sh @@ -26,6 +26,12 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/doc/examples/plot_hyperparameter.py +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/doc/examples/doc_code/torch_example.py + +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/doc/examples/doc_code/tf_example.py + ######################## RLLIB TESTS ################################# source $ROOT_DIR/run_rllib_tests.sh @@ -56,12 +62,6 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/experimental/sgd/examples/tune_example.py --num-replicas=2 -$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/doc/examples/doc_code/torch_example.py - -$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/doc/examples/doc_code/tf_example.py - $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/experimental/sgd/examples/tensorflow_train_example.py diff --git a/doc/examples/doc_code/tf_example.py b/doc/examples/doc_code/tf_example.py index 01a7f5d4c..a1380ba80 100644 --- a/doc/examples/doc_code/tf_example.py +++ b/doc/examples/doc_code/tf_example.py @@ -9,11 +9,11 @@ in the documentation. 
# yapf: disable # __tf_model_start__ -import tensorflow as tf from tensorflow.keras import layers def create_keras_model(): + import tensorflow as tf model = tf.keras.Sequential() # Adds a densely-connected layer with 64 units to the model: model.add(layers.Dense(64, activation="relu", input_shape=(32, ))) @@ -23,7 +23,7 @@ def create_keras_model(): model.add(layers.Dense(10, activation="softmax")) model.compile( - optimizer=tf.train.RMSPropOptimizer(0.01), + optimizer=tf.keras.optimizers.RMSprop(0.01), loss=tf.keras.losses.categorical_crossentropy, metrics=[tf.keras.metrics.categorical_accuracy]) return model diff --git a/doc/source/tf_distributed_training.rst b/doc/source/tf_distributed_training.rst index 04dd68f4f..4507024f4 100644 --- a/doc/source/tf_distributed_training.rst +++ b/doc/source/tf_distributed_training.rst @@ -1,9 +1,9 @@ -TensorFlow Distributed Training API (Experimental) -================================================== +TF Distributed Training +======================= Ray's ``TFTrainer`` simplifies distributed model training for Tensorflow. The ``TFTrainer`` is a wrapper around ``MultiWorkerMirroredStrategy`` with a Python API to easily incorporate distributed training into a larger Python application, as opposed to write custom logic of setting environments and starting separate processes. -.. important:: This API has only been tested with TensorFlow2.0rc. +.. important:: This API has only been tested with TensorFlow 2.0rc and is still highly experimental. Please file a bug report if you run into any issues. Thanks! ---------- diff --git a/doc/source/using-ray-with-tensorflow.rst b/doc/source/using-ray-with-tensorflow.rst index 39acabe49..e755c91cc 100644 --- a/doc/source/using-ray-with-tensorflow.rst +++ b/doc/source/using-ray-with-tensorflow.rst @@ -1,7 +1,54 @@ Best Practices: Ray with Tensorflow =================================== -This document describes best practices for using Ray with TensorFlow.
Feel free to contribute if you think this document is missing anything. +This document describes best practices for using the Ray core APIs with TensorFlow. Ray also provides higher-level utilities for working with TensorFlow, such as distributed training APIs (`training tensorflow example`_), Tune for hyperparameter search (`Tune tensorflow example`_), and RLlib for reinforcement learning (`RLlib tensorflow example`_). + +.. _`training tensorflow example`: tf_distributed_training.html +.. _`Tune tensorflow example`: https://github.com/ray-project/ray/blob/master/python/ray/tune/examples/tf_mnist_example.py +.. _`RLlib tensorflow example`: rllib-models.html#tensorflow-models + +Feel free to contribute if you think this document is missing anything. + + +Common Issues: Pickling +----------------------- + +One common issue with TensorFlow 2.0 is a pickling error like the following: + +.. code-block:: text + + File "/home/***/venv/lib/python3.6/site-packages/ray/actor.py", line 322, in remote + return self._remote(args=args, kwargs=kwargs) + File "/home/***/venv/lib/python3.6/site-packages/ray/actor.py", line 405, in _remote + self._modified_class, self._actor_method_names) + File "/home/***/venv/lib/python3.6/site-packages/ray/function_manager.py", line 578, in export_actor_class + "class": pickle.dumps(Class), + File "/home/***/venv/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 1123, in dumps + cp.dump(obj) + File "/home/***/lib/python3.6/site-packages/ray/cloudpickle/cloudpickle.py", line 482, in dump + return Pickler.dump(self, obj) + File "/usr/lib/python3.6/pickle.py", line 409, in dump + self.save(obj) + File "/usr/lib/python3.6/pickle.py", line 476, in save + f(self, obj) # Call unbound method with explicit self + File "/usr/lib/python3.6/pickle.py", line 751, in save_tuple + save(element) + File "/usr/lib/python3.6/pickle.py", line 808, in _batch_appends + save(tmp[0]) + File "/usr/lib/python3.6/pickle.py", line 496, in save + rv =
reduce(self.proto) + TypeError: can't pickle _LazyLoader objects + +To resolve this, you should move all instances of ``import tensorflow`` into the Ray actor or function, as follows: + +.. code-block:: python + + def create_model(): + import tensorflow as tf + ... + +This issue is caused by side effects of importing TensorFlow at module level, which sets global state that cannot be pickled. + Use Actors for Parallel Models ------------------------------