diff --git a/ci/jenkins_tests/run_multi_node_tests.sh b/ci/jenkins_tests/run_multi_node_tests.sh index 89250bd50..90bf860d2 100755 --- a/ci/jenkins_tests/run_multi_node_tests.sh +++ b/ci/jenkins_tests/run_multi_node_tests.sh @@ -308,6 +308,9 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/rllib/examples/parametric_action_cartpole.py --run=DQN --stop=50 +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/rllib/examples/custom_loss.py --iters=2 + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/rllib/test/test_lstm.py @@ -329,9 +332,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/rllib/test/test_supported_spaces.py -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - pytest /ray/python/ray/tune/test/cluster_tests.py - docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/rllib/test/test_env_with_subprocess.py @@ -352,70 +352,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/rllib/test/multiagent_pendulum.py -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/tune_mnist_ray.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/pbt_example.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/hyperband_example.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/async_hyperband_example.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/tune_mnist_ray_hyperband.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/tune_mnist_async_hyperband.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/logging_example.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/bayesopt_example.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/hyperopt_example.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} -e SIGOPT_KEY $DOCKER_SHA \ - python /ray/python/ray/tune/examples/sigopt_example.py \ - --smoke-test - -# Runs only on Python3 -# docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ -# python /ray/python/ray/tune/examples/nevergrad_example.py \ -# --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/tune_mnist_keras.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/mnist_pytorch.py --smoke-test --no-cuda - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/mnist_pytorch_trainable.py \ - --smoke-test --no-cuda - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/genetic_example.py \ - --smoke-test - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/tune/examples/skopt_example.py \ - --smoke-test - docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/rllib/examples/multiagent_cartpole.py --num-iters=2 @@ -524,3 +460,71 @@ python3 $ROOT_DIR/multi_node_docker_test.py \ --mem-size=60G \ --shm-size=60G \ --test-script=/ray/test/jenkins_tests/multi_node_tests/large_memory_test.py + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + pytest /ray/python/ray/tune/test/cluster_tests.py + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/tune_mnist_ray.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/pbt_example.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/hyperband_example.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/async_hyperband_example.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/tune_mnist_ray_hyperband.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/tune_mnist_async_hyperband.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/logging_example.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/bayesopt_example.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/hyperopt_example.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} -e SIGOPT_KEY $DOCKER_SHA \ + python /ray/python/ray/tune/examples/sigopt_example.py \ + --smoke-test + +# Runs only on Python3 +# docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +# python /ray/python/ray/tune/examples/nevergrad_example.py \ +# --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/tune_mnist_keras.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/mnist_pytorch.py --smoke-test --no-cuda + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/mnist_pytorch_trainable.py \ + --smoke-test --no-cuda + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/genetic_example.py \ + --smoke-test + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/tune/examples/skopt_example.py \ + --smoke-test + diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index b66f470ac..a03077c7e 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -64,6 +64,10 @@ You can also register a custom env creator function with a string name. This fun For a full runnable code example using the custom environment API, see `custom_env.py `__. +.. warning:: + + Please do **not** try to use gym registration to register custom environments. The gym registry is not compatible with Ray. Instead, always use the registration flows documented above. + Configuring Environments ------------------------ diff --git a/doc/source/rllib-examples.rst b/doc/source/rllib-examples.rst index 55c19d8e2..87fd2225c 100644 --- a/doc/source/rllib-examples.rst +++ b/doc/source/rllib-examples.rst @@ -28,6 +28,8 @@ Custom Envs and Models - `Registering a custom env `__: Example of defining and registering a gym env for use with RLlib. +- `Registering a custom model with supervised loss `__: + Example of defining and registering a custom model with a supervised loss. - `Subprocess environment `__: Example of how to ensure subprocesses spawned by envs are killed when RLlib exits. - `Batch normalization `__: diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 6f827809f..2abd2b533 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -8,8 +8,11 @@ The following diagram provides a conceptual overview of data flow between differ The components highlighted in green can be replaced with custom user-defined implementations, as described in the next sections. The purple components are RLlib internal, which means they can only be modified by changing the algorithm source code. +Default Behaviours +------------------ + Built-in Models and Preprocessors ---------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RLlib picks default models based on a simple heuristic: a `vision network `__ for image observations, and a `fully connected network `__ for everything else. These models can be configured via the ``model`` config key, documented in the model `catalog `__. Note that you'll probably have to configure ``conv_filters`` if your environment observations have custom sizes, e.g., ``"model": {"dim": 42, "conv_filters": [[16, [4, 4], 2], [32, [4, 4], 2], [512, [11, 11], 1]]}`` for 42x42 observations. @@ -30,7 +33,7 @@ The following is a list of the built-in model hyperparameters: Custom Models (TensorFlow) -------------------------- -Custom TF models should subclass the common RLlib `model class `__ and override the ``_build_layers_v2`` method. This method takes in a dict of tensor inputs (the observation ``obs``, ``prev_action``, and ``prev_reward``, ``is_training``), and returns a feature layer and float vector of the specified output size. You can also override the ``value_function`` method to implement a custom value branch. A self-supervised loss can be defined via the ``loss`` method. The model can then be registered and used in place of a built-in model: +Custom TF models should subclass the common RLlib `model class `__ and override the ``_build_layers_v2`` method. This method takes in a dict of tensor inputs (the observation ``obs``, ``prev_action``, and ``prev_reward``, ``is_training``), and returns a feature layer and float vector of the specified output size. You can also override the ``value_function`` method to implement a custom value branch. Additional supervised / self-supervised losses can be added via the ``custom_loss`` method. The model can then be registered and used in place of a built-in model: .. code-block:: python @@ -87,17 +90,38 @@ Custom TF models should subclass the common RLlib `model class `__. + +**TensorFlow**: To add a supervised loss to a custom TF model, you need to override the ``custom_loss()`` method. This method takes in the existing policy loss for the algorithm, which you can add your own supervised loss to before returning. For debugging, you can also return a dictionary of scalar tensors in the ``custom_metrics()`` method. Here is a `runnable example `__ of adding an imitation loss to CartPole training that is defined over a `offline dataset `__. + +**PyTorch**: There is no explicit API for adding losses to custom torch models. However, you can modify the loss in the policy graph definition directly. Like for TF models, offline datasets can be incorporated by creating an input reader and calling ``reader.next()`` in the loss forward pass. + + +Variable-length / Parametric Action Spaces +------------------------------------------ + +Custom models can be used to work with environments where (1) the set of valid actions `varies per step `__, and/or (2) the number of valid actions is `very large `__. The general idea is that the meaning of actions can be completely conditioned on the observation, i.e., the ``a`` in ``Q(s, a)`` becomes just a token in ``[0, MAX_AVAIL_ACTIONS)`` that only has meaning in the context of ``s``. This works with algorithms in the `DQN and policy-gradient families `__ and can be implemented as follows: + +1. The environment should return a mask and/or list of valid action embeddings as part of the observation for each step. To enable batching, the number of actions can be allowed to vary from 1 to some max number: + +.. code-block:: python + + class MyParamActionEnv(gym.Env): + def __init__(self, max_avail_actions): + self.action_space = Discrete(max_avail_actions) + self.observation_space = Dict({ + "action_mask": Box(0, 1, shape=(max_avail_actions, )), + "avail_actions": Box(-1, 1, shape=(max_avail_actions, action_embedding_sz)), + "real_obs": ..., + }) + +2. A custom model can be defined that can interpret the ``action_mask`` and ``avail_actions`` portions of the observation. Here the model computes the action logits via the dot product of some network output and each action embedding. Invalid actions can be masked out of the softmax by scaling the probability to zero: + +.. code-block:: python + + class MyParamActionModel(Model): + def _build_layers_v2(self, input_dict, num_outputs, options): + avail_actions = input_dict["obs"]["avail_actions"] + action_mask = input_dict["obs"]["action_mask"] + + output = FullyConnectedNetwork( + input_dict["obs"]["real_obs"], num_outputs=action_embedding_sz) + + # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the + # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE]. + intent_vector = tf.expand_dims(output, 1) + + # Shape of logits is [BATCH, MAX_ACTIONS]. + action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2) + + # Mask out invalid actions (use tf.float32.min for stability) + inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min) + masked_logits = inf_mask + action_logits + + return masked_logits, last_layer + + +Depending on your use case it may make sense to use just the masking, just action embeddings, or both. For a runnable example of this in code, check out `parametric_action_cartpole.py `__. Note that since masking introduces ``tf.float32.min`` values into the model output, this technique might not work with all algorithm options. For example, algorithms might crash if they incorrectly process the ``tf.float32.min`` values. The cartpole example has working configurations for DQN (must set ``hiddens=[]``), PPO (must disable running mean and set ``vf_share_layers=True``), and several other algorithms. Customizing Policy Graphs ------------------------- @@ -281,55 +360,8 @@ Then, you can create an agent with your custom policy graph by: In this example we overrode existing methods of the existing DDPG policy graph, i.e., `_build_q_network`, `_build_p_network`, `_build_action_network`, `_build_actor_critic_loss`, but you can also replace the entire graph class entirely. -Variable-length / Parametric Action Spaces ------------------------------------------- - -Custom models can be used to work with environments where (1) the set of valid actions varies per step, and/or (2) the number of valid actions is very large, as in `OpenAI Five `__ and `Horizon `__. The general idea is that the meaning of actions can be completely conditioned on the observation, i.e., the ``a`` in ``Q(s, a)`` becomes just a token in ``[0, MAX_AVAIL_ACTIONS)`` that only has meaning in the context of ``s``. This works with algorithms in the `DQN and policy-gradient families `__ and can be implemented as follows: - -1. The environment should return a mask and/or list of valid action embeddings as part of the observation for each step. To enable batching, the number of actions can be allowed to vary from 1 to some max number: - -.. code-block:: python - - class MyParamActionEnv(gym.Env): - def __init__(self, max_avail_actions): - self.action_space = Discrete(max_avail_actions) - self.observation_space = Dict({ - "action_mask": Box(0, 1, shape=(max_avail_actions, )), - "avail_actions": Box(-1, 1, shape=(max_avail_actions, action_embedding_sz)), - "real_obs": ..., - }) - -2. A custom model can be defined that can interpret the ``action_mask`` and ``avail_actions`` portions of the observation. Here the model computes the action logits via the dot product of some network output and each action embedding. Invalid actions can be masked out of the softmax by scaling the probability to zero: - -.. code-block:: python - - class MyParamActionModel(Model): - def _build_layers_v2(self, input_dict, num_outputs, options): - avail_actions = input_dict["obs"]["avail_actions"] - action_mask = input_dict["obs"]["action_mask"] - - output = FullyConnectedNetwork( - input_dict["obs"]["real_obs"], num_outputs=action_embedding_sz) - - # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the - # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE]. - intent_vector = tf.expand_dims(output, 1) - - # Shape of logits is [BATCH, MAX_ACTIONS]. - action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2) - - # Mask out invalid actions (use tf.float32.min for stability) - inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min) - masked_logits = inf_mask + action_logits - - return masked_logits, last_layer - - -Depending on your use case it may make sense to use just the masking, just action embeddings, or both. For a runnable example of this in code, check out `parametric_action_cartpole.py `__. Note that since masking introduces ``tf.float32.min`` values into the model output, this technique might not work with all algorithm options. For example, algorithms might crash if they incorrectly process the ``tf.float32.min`` values. The cartpole example has working configurations for DQN (must set ``hiddens=[]``), PPO (must disable running mean and set ``vf_share_layers=True``), and several other algorithms. - - Model-Based Rollouts --------------------- +~~~~~~~~~~~~~~~~~~~~ With a custom policy graph, you can also perform model-based rollouts and optionally incorporate the results of those rollouts as training data. For example, suppose you wanted to extend PGPolicyGraph for model-based rollouts. This involves overriding the ``compute_actions`` method of that policy graph: diff --git a/doc/source/rllib-offline.rst b/doc/source/rllib-offline.rst index e3e935e97..e69339077 100644 --- a/doc/source/rllib-offline.rst +++ b/doc/source/rllib-offline.rst @@ -129,6 +129,27 @@ Scaling I/O throughput Similar to scaling online training, you can scale offline I/O throughput by increasing the number of RLlib workers via the ``num_workers`` config. Each worker accesses offline storage independently in parallel, for linear scaling of I/O throughput. Within each read worker, files are chosen in random order for reads, but file contents are read sequentially. +Input Pipeline for Supervised Losses +------------------------------------ + +You can also define supervised model losses over offline data. This requires defining a `custom model loss `__. We provide a convenience function, ``InputReader.tf_input_ops()``, that can be used to convert any input reader to a TF input pipeline. For example: + +.. code-block:: python + + def custom_loss(self, policy_loss): + input_reader = JsonReader("/tmp/cartpole-out") + # print(input_reader.next()) # if you want to access imperatively + + input_ops = input_reader.tf_input_ops() + print(input_ops["obs"]) # -> output Tensor shape=[None, 4] + print(input_ops["actions"]) # -> output Tensor shape=[None] + + supervised_loss = some_function_of(input_ops) + return policy_loss + supervised_loss + +See `custom_loss.py `__ for a runnable example of using these TF input ops in a custom loss. + + Input API --------- diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 289558911..51c7a4f05 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -82,17 +82,17 @@ Algorithms Models and Preprocessors ------------------------ * `RLlib Models and Preprocessors Overview `__ -* `Built-in Models and Preprocessors `__ * `Custom Models (TensorFlow) `__ * `Custom Models (PyTorch) `__ * `Custom Preprocessors `__ -* `Customizing Policy Graphs `__ +* `Supervised Model Losses `__ * `Variable-length / Parametric Action Spaces `__ -* `Model-Based Rollouts `__ +* `Customizing Policy Graphs `__ Offline Datasets ---------------- * `Working with Offline Datasets `__ +* `Input Pipeline for Supervised Losses `__ * `Input API `__ * `Output API `__ diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py index 105996e38..882c64031 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py @@ -98,7 +98,8 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph): obs_input=self.observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), - loss=self.model.loss() + self.loss.total_loss, + loss=self.loss.total_loss, + model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py index 4e918a7c2..25ecd4ae8 100644 --- a/python/ray/rllib/agents/agent.py +++ b/python/ray/rllib/agents/agent.py @@ -243,7 +243,7 @@ class Agent(Trainable): self.global_vars = {"timestep": 0} # Agents allow env ids to be passed directly to the constructor. - self._env_id = _register_if_needed(env or config.get("env")) + self._env_id = self._register_if_needed(env or config.get("env")) # Create a default logger creator if no logger_creator is specified if logger_creator is None: @@ -671,11 +671,14 @@ class Agent(Trainable): if "optimizer" in state: self.optimizer.restore(state["optimizer"]) - -def _register_if_needed(env_object): - if isinstance(env_object, six.string_types): - return env_object - elif isinstance(env_object, type): - name = env_object.__name__ - register_env(name, lambda config: env_object(config)) - return name + def _register_if_needed(self, env_object): + if isinstance(env_object, six.string_types): + return env_object + elif isinstance(env_object, type): + name = env_object.__name__ + register_env(name, lambda config: env_object(config)) + return name + raise ValueError( + "{} is an invalid env specification. ".format(env_object) + + "You can specify a custom env as either a class " + "(e.g., YourEnvCls) or a registered env id (e.g., \"your_env\").") diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index 5f3e7dd5d..0089ed6bf 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -334,10 +334,11 @@ class DDPGPolicyGraph(TFPolicyGraph): config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) # Model self-supervised losses - self.loss.actor_loss += self.p_model.loss() - self.loss.critic_loss += self.q_model.loss() + self.loss.actor_loss = self.p_model.custom_loss(self.loss.actor_loss) + self.loss.critic_loss = self.q_model.custom_loss(self.loss.critic_loss) if self.config["twin_q"]: - self.loss.critic_loss += self.twin_q_model.loss() + self.loss.critic_loss = self.twin_q_model.custom_loss( + self.loss.critic_loss) # update_target_fn will be called periodically to copy Q network to # target Q network diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py index 561bd8be3..d422b0f3c 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py +++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py @@ -410,7 +410,8 @@ class DQNPolicyGraph(TFPolicyGraph): obs_input=self.cur_observations, action_sampler=self.output_actions, action_prob=self.action_prob, - loss=model.loss() + self.loss.loss, + loss=self.loss.loss, + model=model, loss_inputs=self.loss_inputs, update_ops=q_batchnorm_update_ops) self.sess.run(tf.global_variables_initializer()) diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy_graph.py index d8ce1be6e..7a5ccb8c9 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy_graph.py +++ b/python/ray/rllib/agents/impala/vtrace_policy_graph.py @@ -216,7 +216,8 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph): obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), - loss=self.model.loss() + self.loss.total_loss, + loss=self.loss.total_loss, + model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, diff --git a/python/ray/rllib/agents/marwil/marwil_policy_graph.py b/python/ray/rllib/agents/marwil/marwil_policy_graph.py index c1a155340..8d52807ef 100644 --- a/python/ray/rllib/agents/marwil/marwil_policy_graph.py +++ b/python/ray/rllib/agents/marwil/marwil_policy_graph.py @@ -114,7 +114,8 @@ class MARWILPolicyGraph(TFPolicyGraph): obs_input=self.obs_t, action_sampler=self.output_actions, action_prob=action_dist.sampled_action_prob(), - loss=self.model.loss() + objective, + loss=objective, + model=self.model, loss_inputs=self.loss_inputs, state_inputs=self.model.state_in, state_outputs=self.model.state_out, diff --git a/python/ray/rllib/agents/mock.py b/python/ray/rllib/agents/mock.py index 89d17848e..ac8efb36b 100644 --- a/python/ray/rllib/agents/mock.py +++ b/python/ray/rllib/agents/mock.py @@ -46,6 +46,9 @@ class _MockAgent(Agent): self.info = info self.restored = True + def _register_if_needed(self, env_object): + pass + def set_info(self, info): self.info = info return info diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index c8f9a1185..4907e00ff 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -68,8 +68,9 @@ class PGPolicyGraph(TFPolicyGraph): obs_input=obs, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), - loss=self.model.loss() + loss, + loss=loss, loss_inputs=loss_in, + model=self.model, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index e0716f274..cb67e047e 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -321,7 +321,8 @@ class AsyncPPOPolicyGraph(LearningRateSchedule, TFPolicyGraph): obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), - loss=self.model.loss() + self.loss.total_loss, + loss=self.loss.total_loss, + model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, @@ -339,7 +340,6 @@ class AsyncPPOPolicyGraph(LearningRateSchedule, TFPolicyGraph): values_batched = to_batches(values) self.stats_fetches = { "stats": { - "model_loss": self.model.loss(), "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 3159de562..bd098e697 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -148,6 +148,8 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph): existing_state_in = None existing_seq_lens = None self.observations = obs_ph + self.prev_actions = prev_actions_ph + self.prev_rewards = prev_rewards_ph self.loss_in = [ ("obs", obs_ph), @@ -245,7 +247,8 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph): obs_input=obs_ph, action_sampler=self.sampler, action_prob=curr_action_dist.sampled_action_prob(), - loss=self.model.loss() + self.loss_obj.loss, + loss=self.loss_obj.loss, + model=self.model, loss_inputs=self.loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, @@ -289,7 +292,9 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph): next_state = [] for i in range(len(self.model.state_in)): next_state.append([sample_batch["state_out_{}".format(i)][-1]]) - last_r = self._value(sample_batch["new_obs"][-1], *next_state) + last_r = self._value(sample_batch["new_obs"][-1], + sample_batch["actions"][-1], + sample_batch["rewards"][-1], *next_state) batch = compute_advantages( sample_batch, last_r, @@ -336,8 +341,13 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph): self.kl_coeff.load(self.kl_coeff_val, session=self.sess) return self.kl_coeff_val - def _value(self, ob, *args): - feed_dict = {self.observations: [ob], self.model.seq_lens: [1]} + def _value(self, ob, prev_action, prev_reward, *args): + feed_dict = { + self.observations: [ob], + self.prev_actions: [prev_action], + self.prev_rewards: [prev_reward], + self.model.seq_lens: [1] + } assert len(args) == len(self.model.state_in), \ (args, self.model.state_in) for k, v in zip(self.model.state_in, args): diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index 39e395cee..7a8d32345 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -8,10 +8,12 @@ import pickle import tensorflow as tf import ray -from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.atari_wrappers import wrap_deepmind, is_atari +from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.env_context import EnvContext +from ray.rllib.env.external_env import ExternalEnv from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.env.vector_env import VectorEnv from ray.rllib.evaluation.interface import EvaluatorInterface from ray.rllib.evaluation.sample_batch import MultiAgentBatch, \ DEFAULT_POLICY_ID @@ -222,7 +224,7 @@ class PolicyEvaluator(EvaluatorInterface): self.compress_observations = compress_observations self.preprocessing_enabled = True - self.env = env_creator(env_context) + self.env = _validate_env(env_creator(env_context)) if isinstance(self.env, MultiAgentEnv) or \ isinstance(self.env, BaseEnv): @@ -701,6 +703,20 @@ def _validate_and_canonicalize(policy_graph, env): } +def _validate_env(env): + # allow this as a special case (assumed gym.Env) + if hasattr(env, "observation_space") and hasattr(env, "action_space"): + return env + + allowed_types = [gym.Env, MultiAgentEnv, ExternalEnv, VectorEnv, BaseEnv] + if not any(isinstance(env, tpe) for tpe in allowed_types): + raise ValueError( + "Returned env should be an instance of gym.Env, MultiAgentEnv, " + "ExternalEnv, VectorEnv, or BaseEnv. The provided env creator " + "function returned {} ({}).".format(env, type(env))) + return env + + def _monitor(env, path): return gym.wrappers.Monitor(env, path, resume=True) diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index 232b869ec..377377a60 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -32,6 +32,7 @@ class TFPolicyGraph(PolicyGraph): Attributes: observation_space (gym.Space): observation space of the policy. action_space (gym.Space): action space of the policy. + model (rllib.models.Model): RLlib model used for the policy. Examples: >>> policy = TFPolicyGraphSubclass( @@ -53,6 +54,7 @@ class TFPolicyGraph(PolicyGraph): action_sampler, loss, loss_inputs, + model=None, action_prob=None, state_inputs=None, state_outputs=None, @@ -79,6 +81,8 @@ class TFPolicyGraph(PolicyGraph): and has shape [BATCH_SIZE, data...]. These keys will be read from postprocessed sample batches and fed into the specified placeholders during loss computation. + model (rllib.models.Model): used to integrate custom losses and + stats from user-defined RLlib models. action_prob (Tensor): probability of the sampled action. state_inputs (list): list of RNN state input Tensors. state_outputs (list): list of RNN state output Tensors. @@ -98,12 +102,18 @@ class TFPolicyGraph(PolicyGraph): self.observation_space = observation_space self.action_space = action_space + self.model = model self._sess = sess self._obs_input = obs_input self._prev_action_input = prev_action_input self._prev_reward_input = prev_reward_input self._sampler = action_sampler - self._loss = loss + if self.model: + self._loss = self.model.custom_loss(loss) + self._stats_fetches = {"model": self.model.custom_stats()} + else: + self._loss = loss + self._stats_fetches = {} self._loss_inputs = loss_inputs self._loss_input_dict = dict(self._loss_inputs) self._is_training = self._get_is_training_placeholder() @@ -375,7 +385,7 @@ class TFPolicyGraph(PolicyGraph): builder.add_feed_dict({self._is_training: True}) builder.add_feed_dict(self._get_loss_inputs_dict(postprocessed_batch)) fetches = builder.add_fetches( - [self._grads, self.extra_compute_grad_fetches()]) + [self._grads, self._get_grad_and_stats_fetches()]) return fetches[0], fetches[1] def _build_apply_gradients(self, builder, gradients): @@ -397,11 +407,18 @@ class TFPolicyGraph(PolicyGraph): builder.add_feed_dict({self._is_training: True}) fetches = builder.add_fetches([ self._apply_op, - self.extra_compute_grad_fetches(), + self._get_grad_and_stats_fetches(), self.extra_apply_grad_fetches() ]) return fetches[1], fetches[2] + def _get_grad_and_stats_fetches(self): + fetches = self.extra_compute_grad_fetches() + if self._stats_fetches: + fetches["stats"] = dict(self._stats_fetches, + **fetches.get("stats", {})) + return fetches + def _get_loss_inputs_dict(self, batch): feed_dict = {} if self._batch_divisibility_req > 1: diff --git a/python/ray/rllib/examples/custom_loss.py b/python/ray/rllib/examples/custom_loss.py new file mode 100644 index 000000000..1b2e80cb0 --- /dev/null +++ b/python/ray/rllib/examples/custom_loss.py @@ -0,0 +1,100 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +"""Example of using custom_loss() with an imitation learning loss. + +The default input file is too small to learn a good policy, but you can +generate new experiences for IL training as follows: + +To generate experiences: +$ ./train.py --run=PG --config='{"output": "/tmp/cartpole"}' --env=CartPole-v0 + +To train on experiences with joint PG + IL loss: +$ python custom_loss.py --input-files=/tmp/cartpole +""" + +import argparse +import os +import tensorflow as tf + +import ray +from ray.tune import run_experiments +from ray.rllib.models import (Categorical, FullyConnectedNetwork, Model, + ModelCatalog) +from ray.rllib.models.model import restore_original_dimensions +from ray.rllib.offline import JsonReader + +parser = argparse.ArgumentParser() +parser.add_argument("--iters", type=int, default=200) +parser.add_argument( + "--input-files", + type=str, + default=os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "../test/data/cartpole_small")) + + +class CustomLossModel(Model): + """Custom model that adds an imitation loss on top of the policy loss.""" + + def _build_layers_v2(self, input_dict, num_outputs, options): + self.obs_in = input_dict["obs"] + self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space, + num_outputs, options) + return self.fcnet.outputs, self.fcnet.last_layer + + def custom_loss(self, policy_loss): + # create a new input reader per worker + reader = JsonReader(self.options["custom_options"]["input_files"]) + input_ops = reader.tf_input_ops() + + # define a secondary loss by building a graph copy with weight sharing + with tf.variable_scope( + self.scope, reuse=tf.AUTO_REUSE, auxiliary_name_scope=False): + logits, _ = self._build_layers_v2({ + "obs": restore_original_dimensions(input_ops["obs"], + self.obs_space) + }, self.num_outputs, self.options) + + # You can also add self-supervised losses easily by referencing tensors + # created during _build_layers_v2(). For example, an autoencoder-style + # loss can be added as follows: + # ae_loss = squared_diff(self.obs_in, Decoder(self.fcnet.last_layer)) + + # compute the IL loss + action_dist = Categorical(logits) + self.policy_loss = policy_loss + self.imitation_loss = tf.reduce_mean( + -action_dist.logp(input_ops["actions"])) + return policy_loss + 10 * self.imitation_loss + + def custom_stats(self): + return { + "policy_loss": self.policy_loss, + "imitation_loss": self.imitation_loss, + } + + +if __name__ == "__main__": + ray.init() + args = parser.parse_args() + + ModelCatalog.register_custom_model("custom_loss", CustomLossModel) + run_experiments({ + "custom_loss": { + "run": "PG", + "env": "CartPole-v0", + "stop": { + "training_iteration": args.iters, + }, + "config": { + "num_workers": 0, + "model": { + "custom_model": "custom_loss", + "custom_options": { + "input_files": args.input_files, + }, + }, + }, + }, + }) diff --git a/python/ray/rllib/examples/parametric_action_cartpole.py b/python/ray/rllib/examples/parametric_action_cartpole.py index 5af79c20e..8b32e5d9a 100644 --- a/python/ray/rllib/examples/parametric_action_cartpole.py +++ b/python/ray/rllib/examples/parametric_action_cartpole.py @@ -1,8 +1,7 @@ """Example of handling variable length and/or parametric action spaces. This is a toy example of the action-embedding based approach for handling large -discrete action spaces (potentially infinite in size), similar to how -OpenAI Five works: +discrete action spaces (potentially infinite in size), similar to this: https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/ diff --git a/python/ray/rllib/models/model.py b/python/ray/rllib/models/model.py index 615b149e8..cb2c7c6aa 100644 --- a/python/ray/rllib/models/model.py +++ b/python/ray/rllib/models/model.py @@ -9,7 +9,7 @@ import tensorflow as tf from ray.rllib.models.misc import linear, normc_initializer from ray.rllib.models.preprocessors import get_preprocessor -from ray.rllib.utils.annotations import PublicAPI +from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI @PublicAPI @@ -58,6 +58,11 @@ class Model(object): self.state_init = [] self.state_in = state_in or [] self.state_out = [] + self.obs_space = obs_space + self.num_outputs = num_outputs + self.options = options + self.scope = tf.get_variable_scope() + self.session = tf.get_default_session() if seq_lens is not None: self.seq_lens = seq_lens else: @@ -69,9 +74,11 @@ class Model(object): assert num_outputs % 2 == 0 num_outputs = num_outputs // 2 try: + restored = input_dict.copy() + restored["obs"] = restore_original_dimensions( + input_dict["obs"], obs_space) self.outputs, self.last_layer = self._build_layers_v2( - _restore_original_dimensions(input_dict, obs_space), - num_outputs, options) + restored, num_outputs, options) except NotImplementedError: self.outputs, self.last_layer = self._build_layers( input_dict["obs"], num_outputs, options) @@ -139,17 +146,46 @@ class Model(object): linear(self.last_layer, 1, "value", normc_initializer(1.0)), [-1]) @PublicAPI - def loss(self): - """Builds any built-in (self-supervised) loss for the model. + def custom_loss(self, policy_loss): + """Override to customize the loss function used to optimize this model. - For example, this can be used to incorporate auto-encoder style losses. - Note that this loss has to be included in the policy graph loss to have - an effect (done for built-in algorithms). + This can be used to incorporate self-supervised losses (by defining + a loss over existing input and output tensors of this model), and + supervised losses (by defining losses over a variable-sharing copy of + this model's layers). + + You can find an runnable example in examples/custom_loss.py. + + Arguments: + policy_loss (Tensor): scalar policy loss from the policy graph. Returns: - Scalar tensor for the self-supervised loss. + Scalar tensor for the customized loss for this model. """ - return tf.constant(0.0) + if self.loss() is not None: + raise DeprecationWarning( + "self.loss() is deprecated, use self.custom_loss() instead.") + return policy_loss + + @PublicAPI + def custom_stats(self): + """Override to return custom metrics from your model. + + The stats will be reported as part of the learner stats, i.e., + info: + learner: + model: + key1: metric1 + key2: metric2 + + Returns: + Dict of string keys to scalar tensors. + """ + return {} + + def loss(self): + """Deprecated: use self.custom_loss().""" + return None def _validate_output_shape(self): """Checks that the model has the correct number of outputs.""" @@ -165,15 +201,29 @@ class Model(object): self._num_outputs, shape)) -def _restore_original_dimensions(input_dict, obs_space, tensorlib=tf): +@DeveloperAPI +def restore_original_dimensions(obs, obs_space, tensorlib=tf): + """Unpacks Dict and Tuple space observations into their original form. + + This is needed since we flatten Dict and Tuple observations in transit. + Before sending them to the model though, we should unflatten them into + Dicts or Tuples of tensors. + + Arguments: + obs: The flattened observation tensor. + obs_space: The flattened obs space. If this has the `original_space` + attribute, we will unflatten the tensor to that shape. + tensorlib: The library used to unflatten (reshape) the array/tensor. + + Returns: + single tensor or dict / tuple of tensors matching the original + observation space. + """ + if hasattr(obs_space, "original_space"): - return dict( - input_dict, - obs=_unpack_obs( - input_dict["obs"], - obs_space.original_space, - tensorlib=tensorlib)) - return input_dict + return _unpack_obs(obs, obs_space.original_space, tensorlib=tensorlib) + else: + return obs def _unpack_obs(obs, space, tensorlib=tf): diff --git a/python/ray/rllib/models/pytorch/model.py b/python/ray/rllib/models/pytorch/model.py index 2fcf88a04..48b9587a7 100644 --- a/python/ray/rllib/models/pytorch/model.py +++ b/python/ray/rllib/models/pytorch/model.py @@ -5,7 +5,7 @@ from __future__ import print_function import torch import torch.nn as nn -from ray.rllib.models.model import _restore_original_dimensions +from ray.rllib.models.model import restore_original_dimensions from ray.rllib.utils.annotations import PublicAPI @@ -31,8 +31,8 @@ class TorchModel(nn.Module): def forward(self, input_dict, hidden_state): """Wraps _forward() to unpack flattened Dict and Tuple observations.""" input_dict["obs"] = input_dict["obs"].float() # TODO(ekl): avoid cast - input_dict = _restore_original_dimensions( - input_dict, self.obs_space, tensorlib=torch) + input_dict["obs"] = restore_original_dimensions( + input_dict["obs"], self.obs_space, tensorlib=torch) outputs, features, vf, h = self._forward(input_dict, hidden_state) return outputs, features, vf, h diff --git a/python/ray/rllib/offline/input_reader.py b/python/ray/rllib/offline/input_reader.py index 073cf3148..b45126609 100644 --- a/python/ray/rllib/offline/input_reader.py +++ b/python/ray/rllib/offline/input_reader.py @@ -2,8 +2,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import logging +import numpy as np +import tensorflow as tf +import threading + +from ray.rllib.evaluation.sample_batch import MultiAgentBatch from ray.rllib.utils.annotations import PublicAPI +logger = logging.getLogger(__name__) + @PublicAPI class InputReader(object): @@ -17,3 +25,97 @@ class InputReader(object): SampleBatch or MultiAgentBatch read. """ raise NotImplementedError + + @PublicAPI + def tf_input_ops(self, queue_size=1): + """Returns TensorFlow queue ops for reading inputs from this reader. + + The main use of these ops is for integration into custom model losses. + For example, you can use tf_input_ops() to read from files of external + experiences to add an imitation learning loss to your model. + + This method creates a queue runner thread that will call next() on this + reader repeatedly to feed the TensorFlow queue. + + Arguments: + queue_size (int): Max elements to allow in the TF queue. + + Example: + >>> class MyModel(rllib.model.Model): + ... def custom_loss(self, policy_loss): + ... reader = JsonReader(...) + ... input_ops = reader.tf_input_ops() + ... with tf.variable_scope( + ... self.scope, reuse=tf.AUTO_REUSE, + ... auxiliary_name_scope=False): + ... logits, _ = self._build_layers_v2( + ... {"obs": input_ops["obs"]}, + ... self.num_outputs, self.options) + ... il_loss = imitation_loss(logits, input_ops["action"]) + ... return policy_loss + il_loss + + You can find a runnable version of this in examples/custom_loss.py. + + Returns: + dict of Tensors, one for each column of the read SampleBatch. + """ + + if hasattr(self, "_queue_runner"): + raise ValueError( + "A queue runner already exists for this input reader. " + "You can only call tf_input_ops() once per reader.") + + logger.info("Reading initial batch of data from input reader.") + batch = self.next() + if isinstance(batch, MultiAgentBatch): + raise NotImplementedError( + "tf_input_ops() is not implemented for multi agent batches") + + keys = [ + k for k in sorted(list(batch.keys())) + if np.issubdtype(batch[k].dtype, np.number) + ] + dtypes = [batch[k].dtype for k in keys] + shapes = { + k: (-1, ) + s[1:] + for (k, s) in [(k, batch[k].shape) for k in keys] + } + queue = tf.FIFOQueue(capacity=queue_size, dtypes=dtypes, names=keys) + tensors = queue.dequeue() + + logger.info("Creating TF queue runner for {}".format(self)) + self._queue_runner = _QueueRunner(self, queue, keys, dtypes) + self._queue_runner.enqueue(batch) + self._queue_runner.start() + + out = {k: tf.reshape(t, shapes[k]) for k, t in tensors.items()} + return out + + +class _QueueRunner(threading.Thread): + """Thread that feeds a TF queue from a InputReader.""" + + def __init__(self, input_reader, queue, keys, dtypes): + threading.Thread.__init__(self) + self.sess = tf.get_default_session() + self.daemon = True + self.input_reader = input_reader + self.keys = keys + self.queue = queue + self.placeholders = [tf.placeholder(dtype) for dtype in dtypes] + self.enqueue_op = queue.enqueue(dict(zip(keys, self.placeholders))) + + def enqueue(self, batch): + data = { + self.placeholders[i]: batch[key] + for i, key in enumerate(self.keys) + } + self.sess.run(self.enqueue_op, feed_dict=data) + + def run(self): + while True: + try: + batch = self.input_reader.next() + self.enqueue(batch) + except Exception: + logger.exception("Error reading from input") diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py index 3149cab13..8d3ef320b 100644 --- a/python/ray/tune/trial_runner.py +++ b/python/ray/tune/trial_runner.py @@ -311,9 +311,10 @@ class TrialRunner(object): for state, trials in states.items() } total_number_of_trials = sum(num_trials_per_state.values()) - messages.append("Number of trials: {} ({})" - "".format(total_number_of_trials, - num_trials_per_state)) + if total_number_of_trials > 0: + messages.append("Number of trials: {} ({})" + "".format(total_number_of_trials, + num_trials_per_state)) for state, trials in sorted(states.items()): limit = limit_per_state[state]