[rllib] Add the ability to run arbitrary Python scripts with ray.tune (#1132)

* fix yaml bug * add ext agent * gpus * update * tuning * docs * Sun Oct 15 21:09:25 PDT 2017 * lint * update * Sun Oct 15 22:39:55 PDT 2017 * Sun Oct 15 22:40:17 PDT 2017 * Sun Oct 15 22:43:06 PDT 2017 * Sun Oct 15 22:46:06 PDT 2017 * Sun Oct 15 22:46:21 PDT 2017 * Sun Oct 15 22:48:11 PDT 2017 * Sun Oct 15 22:48:44 PDT 2017 * Sun Oct 15 22:49:23 PDT 2017 * Sun Oct 15 22:50:21 PDT 2017 * Sun Oct 15 22:53:00 PDT 2017 * Sun Oct 15 22:53:34 PDT 2017 * Sun Oct 15 22:54:33 PDT 2017 * Sun Oct 15 22:54:50 PDT 2017 * Sun Oct 15 22:55:20 PDT 2017 * Sun Oct 15 22:56:56 PDT 2017 * Sun Oct 15 22:59:03 PDT 2017 * fix * Update tune_mnist_ray.py * remove script trial * fix * reorder * fix ex * py2 support * upd * comments * comments * cleanup readme * fix trial * annotate * Update rllib.rst
2026-07-05 03:56:12 +08:00 · 2017-10-18 11:49:28 -07:00
parent 4157bcb80b
commit 5a50e0e1d7
24 changed files with 745 additions and 166 deletions
@@ -9,11 +9,12 @@ import six.moves.queue as queue
 import os

 import ray
+from ray.rllib.agent import Agent
 from ray.rllib.a3c.runner import RunnerThread, process_rollout
 from ray.rllib.a3c.envs import create_and_wrap
-from ray.rllib.common import Agent, TrainingResult
 from ray.rllib.a3c.shared_model import SharedModel
 from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM
+from ray.tune.result import TrainingResult


 DEFAULT_CONFIG = {
@@ -2,7 +2,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from collections import namedtuple
 from datetime import datetime

 import json
@@ -16,6 +15,7 @@ import time
 import uuid

 import tensorflow as tf
+from ray.tune.result import TrainingResult

 if sys.version_info[0] == 2:
    import cStringIO as StringIO
@@ -26,39 +26,6 @@ logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)


-TrainingResult = namedtuple("TrainingResult", [
-    # Unique string identifier for this experiment. This id is preserved
-    # across checkpoint / restore calls.
-    "experiment_id",
-
-    # The index of this training iteration, e.g. call to train().
-    "training_iteration",
-
-    # The mean episode reward reported during this iteration.
-    "episode_reward_mean",
-
-    # The mean episode length reported during this iteration.
-    "episode_len_mean",
-
-    # Agent-specific metadata to report for this iteration.
-    "info",
-
-    # Number of timesteps in the simulator in this iteration.
-    "timesteps_this_iter",
-
-    # Accumulated timesteps for this entire experiment.
-    "timesteps_total",
-
-    # Time in seconds this iteration took to run.
-    "time_this_iter_s",
-
-    # Accumulated time in seconds for this entire experiment.
-    "time_total_s",
-])
-
-TrainingResult.__new__.__defaults__ = (None,) * len(TrainingResult._fields)
-
-
 class Agent(object):
    """All RLlib agents extend this base class.

@@ -71,6 +38,8 @@ class Agent(object):
        logdir (str): Directory in which training outputs should be placed.
    """

+    _allow_unknown_configs = False
+
    def __init__(
            self, env_creator, config, local_dir='/tmp/ray',
            upload_dir=None, agent_id=None):
@@ -97,11 +66,12 @@ class Agent(object):
            self.env_creator = env_creator

        self.config = self._default_config.copy()
-        for k in config.keys():
-            if k not in self.config:
-                raise Exception(
-                    "Unknown agent config `{}`, "
-                    "all agent configs: {}".format(k, self.config.keys()))
+        if not self._allow_unknown_configs:
+            for k in config.keys():
+                if k not in self.config:
+                    raise Exception(
+                        "Unknown agent config `{}`, "
+                        "all agent configs: {}".format(k, self.config.keys()))
        self.config.update(config)
        self.config.update({
            "agent_id": agent_id,
@@ -112,7 +82,7 @@ class Agent(object):

        logdir_suffix = "{}_{}_{}".format(
            env_name,
-            self.__class__.__name__,
+            self._agent_name,
            agent_id or datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))

        if not os.path.exists(local_dir):
@@ -128,12 +98,12 @@ class Agent(object):
        # TODO(ekl) consider inlining config into the result jsons
        config_out = os.path.join(self.logdir, "config.json")
        with open(config_out, "w") as f:
-            json.dump(self.config, f, sort_keys=True, cls=RLLibEncoder)
+            json.dump(self.config, f, sort_keys=True, cls=_Encoder)
        logger.info(
-            "%s algorithm created with logdir '%s' and upload uri '%s'",
+            "%s agent created with logdir '%s' and upload uri '%s'",
            self.__class__.__name__, self.logdir, log_upload_uri)

-        self._result_logger = RLLibLogger(
+        self._result_logger = _Logger(
            os.path.join(self.logdir, "result.json"),
            log_upload_uri and os.path.join(log_upload_uri, "result.json"))
        self._file_writer = tf.summary.FileWriter(self.logdir)
@@ -162,6 +132,8 @@ class Agent(object):
        self._iteration += 1
        time_this_iter = time.time() - start

+        assert result.timesteps_this_iter is not None
+
        self._time_total += time_this_iter
        self._timesteps_total += result.timesteps_this_iter

@@ -170,10 +142,9 @@ class Agent(object):
            training_iteration=self._iteration,
            timesteps_total=self._timesteps_total,
            time_this_iter_s=time_this_iter,
-            time_total_s=self._time_total)
-
-        for field in result:
-            assert field is not None, result
+            time_total_s=self._time_total,
+            pid=os.getpid(),
+            hostname=os.uname()[1])

        self._log_result(result)

@@ -184,18 +155,18 @@ class Agent(object):

        # We need to use a custom json serializer class so that NaNs get
        # encoded as null as required by Athena.
-        json.dump(result._asdict(), self._result_logger, cls=RLLibEncoder)
+        json.dump(result._asdict(), self._result_logger, cls=_Encoder)
        self._result_logger.write("\n")
-        train_stats = tf.Summary(value=[
-            tf.Summary.Value(
-                tag="rllib/time_this_iter_s",
-                simple_value=result.time_this_iter_s),
-            tf.Summary.Value(
-                tag="rllib/episode_reward_mean",
-                simple_value=result.episode_reward_mean),
-            tf.Summary.Value(
-                tag="rllib/episode_len_mean",
-                simple_value=result.episode_len_mean)])
+        attrs_to_log = [
+            "time_this_iter_s", "mean_loss", "mean_accuracy",
+            "episode_reward_mean", "episode_len_mean"]
+        values = []
+        for attr in attrs_to_log:
+            if getattr(result, attr) is not None:
+                values.append(tf.Summary.Value(
+                    tag="ray/tune/{}".format(attr),
+                    simple_value=getattr(result, attr)))
+        train_stats = tf.Summary(value=values)
        self._file_writer.add_summary(train_stats, result.training_iteration)

    def save(self):
@@ -269,10 +240,10 @@ class Agent(object):
        raise NotImplementedError


-class RLLibEncoder(json.JSONEncoder):
+class _Encoder(json.JSONEncoder):

    def __init__(self, nan_str="null", **kwargs):
-        super(RLLibEncoder, self).__init__(**kwargs)
+        super(_Encoder, self).__init__(**kwargs)
        self.nan_str = nan_str

    def iterencode(self, o, _one_shot=False):
@@ -299,7 +270,7 @@ class RLLibEncoder(json.JSONEncoder):
            return int(value)


-class RLLibLogger(object):
+class _Logger(object):
    """Writing small amounts of data to S3 with real-time updates.
    """

@@ -322,3 +293,44 @@ class RLLibLogger(object):
            with self.smart_open(self.uri, "w") as f:
                self.result_buffer.write(b)
                f.write(self.result_buffer.getvalue())
+
+
+class _MockAgent(Agent):
+    """Mock agent for use in tests"""
+
+    _agent_name = "MockAgent"
+    _default_config = {}
+
+    def _init(self):
+        pass
+
+    def _train(self):
+        return TrainingResult(
+            episode_reward_mean=10, episode_len_mean=10,
+            timesteps_this_iter=10, info={})
+
+
+def get_agent_class(alg):
+    """Returns the class of an known agent given its name."""
+
+    if alg == "PPO":
+        from ray.rllib import ppo
+        return ppo.PPOAgent
+    elif alg == "ES":
+        from ray.rllib import es
+        return es.ESAgent
+    elif alg == "DQN":
+        from ray.rllib import dqn
+        return dqn.DQNAgent
+    elif alg == "A3C":
+        from ray.rllib import a3c
+        return a3c.A3CAgent
+    elif alg == "script":
+        from ray.tune import script_runner
+        return script_runner.ScriptRunner
+    elif alg == "__fake":
+        return _MockAgent
+    else:
+        raise Exception(
+            ("Unknown algorithm {}, check --alg argument. Valid choices " +
+             "are PPO, ES, DQN, and A3C.").format(alg))
@@ -1,43 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from ray.rllib.common import Agent, TrainingResult
-
-
-class _MockAgent(Agent):
-    """Mock agent for use in tests"""
-
-    _agent_name = "MockAgent"
-    _default_config = {}
-
-    def _init(self):
-        pass
-
-    def _train(self):
-        return TrainingResult(
-            episode_reward_mean=10, episode_len_mean=10,
-            timesteps_this_iter=10, info={})
-
-
-def get_agent_class(alg):
-    """Returns the class of an known agent given its name."""
-
-    if alg == "PPO":
-        from ray.rllib import ppo
-        return ppo.PPOAgent
-    elif alg == "ES":
-        from ray.rllib import es
-        return es.ESAgent
-    elif alg == "DQN":
-        from ray.rllib import dqn
-        return dqn.DQNAgent
-    elif alg == "A3C":
-        from ray.rllib import a3c
-        return a3c.A3CAgent
-    elif alg == "__fake":
-        return _MockAgent
-    else:
-        raise Exception(
-            ("Unknown algorithm {}, check --alg argument. Valid choices " +
-             "are PPO, ES, DQN, and A3C.").format(alg))
@@ -10,11 +10,12 @@ import os
 import tensorflow as tf

 import ray
-from ray.rllib.common import Agent, TrainingResult
+from ray.rllib.agent import Agent
 from ray.rllib.dqn import logger, models
 from ray.rllib.dqn.common.wrappers import wrap_dqn
 from ray.rllib.dqn.common.schedules import LinearSchedule
 from ray.rllib.dqn.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
+from ray.tune.result import TrainingResult


 """The default configuration dict for the DQN algorithm.
@@ -12,7 +12,7 @@ import pickle
 import time

 import ray
-from ray.rllib.common import Agent, TrainingResult
+from ray.rllib.agent import Agent
 from ray.rllib.models import ModelCatalog

 from ray.rllib.es import optimizers
@@ -20,6 +20,7 @@ from ray.rllib.es import policies
 from ray.rllib.es import tabular_logger as tlogger
 from ray.rllib.es import tf_util
 from ray.rllib.es import utils
+from ray.tune.result import TrainingResult


 Result = namedtuple("Result", [
@@ -11,7 +11,8 @@ import tensorflow as tf
 from tensorflow.python import debug as tf_debug

 import ray
-from ray.rllib.common import Agent, TrainingResult
+from ray.rllib.agent import Agent
+from ray.tune.result import TrainingResult
 from ray.rllib.ppo.runner import Runner, RemoteRunner
 from ray.rllib.ppo.rollout import collect_samples
 from ray.rllib.ppo.utils import shuffle
@@ -8,7 +8,7 @@ import numpy as np
 import ray
 import random

-from ray.rllib.agents import get_agent_class
+from ray.rllib.agent import get_agent_class


 def get_mean_action(alg, obs):
@@ -34,14 +34,18 @@ parser = make_parser("Train a reinforcement learning agent.")
 # defined there.
 parser.add_argument("--redis-address", default=None, type=str,
                    help="The Redis address of the cluster.")
+parser.add_argument("--num-cpus", default=None, type=int,
+                    help="Number of CPUs to allocate to Ray.")
+parser.add_argument("--num-gpus", default=None, type=int,
+                    help="Number of GPUs to allocate to Ray.")
 parser.add_argument("--restore", default=None, type=str,
                    help="If specified, restore from this checkpoint.")
 parser.add_argument("-f", "--config-file", default=None, type=str,
                    help="If specified, use config options from this file.")


-if __name__ == "__main__":
-    args = parser.parse_args()
+def main(argv):
+    args = parser.parse_args(argv)
    runner = TrialRunner()

    if args.config_file:
@@ -56,7 +60,9 @@ if __name__ == "__main__":
                args.resources, args.stop, args.checkpoint_freq,
                args.restore, args.upload_dir))

-    ray.init(redis_address=args.redis_address)
+    ray.init(
+        redis_address=args.redis_address, num_cpus=args.num_cpus,
+        num_gpus=args.num_gpus)

    while not runner.is_finished():
        runner.step()
@@ -64,4 +70,11 @@ if __name__ == "__main__":

    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
+            print("Exit 1")
            sys.exit(1)
+
+    print("Exit 0")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
@@ -5,4 +5,4 @@ hopper-ppo:
    resources:
       cpu: 64
       gpu: 4
-    config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}
+    config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}
@@ -7,5 +7,5 @@ humanoid-ppo-gae:
    resources:
       cpu: 64
       gpu: 4
-    config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}
+    config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}

@@ -7,4 +7,4 @@ humanoid-ppo:
    resources:
       cpu: 64
       gpu: 4
-    config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}
+    config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}
@@ -5,4 +5,4 @@ walker2d-v1-ppo:
    resources:
       cpu: 64
       gpu: 4
-    config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": 1e-4, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}
+    config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}
@@ -1,24 +1,123 @@
-Ray.tune: Fast hyperparameter search
-====================================
+Parallel hyperparameter evaluation with Ray
+===========================================

-Using ray.tune with RLlib
-------------------------
+Using ray.tune for deep neural network training
+-----------------------------------------------

-One way to use ray.tune is through RLlib's train.py script. The train.py script
-supports two modes. For example, to run multiple concurrent trials of Pong:
+With only a couple of changes, you can parallelize evaluation of any existing
+Python script with ray.tune.

- Inline args: ``./train.py --env=Pong-v0 --alg=PPO --num_trials=8 --stop '{"time_total_s": 3200}' --resources '{"cpu": 8, "gpu": 2}' --config '{"num_workers": 8, "sgd_num_iter": 10}'``
+First, you must define a ``train(config, status_reporter)`` function in your
+script. This will be the entry point which Ray will call into.

- File-based: ``./train.py -f tune-pong.yaml``
+.. code:: python

-Both delegate scheduling of trials to the ray.tune TrialRunner class.
-Additionally, the file-based mode supports hyper-parameter tuning
-(currently just grid and random search).
+    def train(config, status_reporter):
+        pass

-To specify search parameters, variables in the `config` section may be set to
-different values for each trial. You can either specify `grid_search: <list>`
+Second, you should periodically report training status by passing a
+``TrainingResult`` tuple to ``status_reporter.report()``.
+
+.. code:: python
+    
+    from ray.tune.result import TrainingResult
+
+    def train(config, status_reporter):
+        for step in range(1000):
+            # do a training iteration
+            status_reporter.report(TrainingResult(
+                timesteps_total=step,  # required
+                mean_loss=train_loss,  # optional
+                mean_accuracy=train_accuracy  # optional
+            ))
+
+You can then launch a hyperparameter tuning run by running ``tune.py``.
+For example:
+
+.. code:: bash
+
+    cd python/ray/tune
+    ./tune.py -f examples/tune_mnist_ray.yaml
+
+The YAML or JSON file passed to ``tune.py`` specifies the configuration of the
+trials to launch. For example, the following YAML describes a grid search over
+activation functions.
+
+.. code:: yaml
+
+    tune_mnist:
+        env: mnist
+        alg: script
+        num_trials: 10
+        resources:
+            cpu: 1
+        stop:
+            mean_accuracy: 0.99
+            time_total_s: 600
+        config:
+            script_file_path: examples/tune_mnist_ray.py
+            script_entrypoint: train
+            activation:
+                grid_search: ['relu', 'elu', 'tanh']
+
+When run, ``./tune.py`` will schedule the trials on Ray, creating a new local
+Ray cluster if an existing cluster address is not specified. Incremental
+status will be reported on the command line, and you can also view the reported
+metrics using TensorBoard:
+
+.. code:: text
+
+    == Status ==
+    Resources used: 4/4 CPUs, 0/0 GPUs
+    Tensorboard logdir: /tmp/ray/tune_mnist
+     - script_mnist_0_activation=relu:	RUNNING [pid=27708], 16 s, 20 ts, 0.46 acc
+     - script_mnist_1_activation=elu:	RUNNING [pid=27709], 16 s, 20 ts, 0.54 acc
+     - script_mnist_2_activation=tanh:	RUNNING [pid=27711], 18 s, 20 ts, 0.74 acc
+     - script_mnist_3_activation=relu:	RUNNING [pid=27713], 12 s, 10 ts, 0.22 acc
+     - script_mnist_4_activation=elu:	PENDING
+     - script_mnist_5_activation=tanh:	PENDING
+     - script_mnist_6_activation=relu:	PENDING
+     - script_mnist_7_activation=elu:	PENDING
+     - script_mnist_8_activation=tanh:	PENDING
+     - script_mnist_9_activation=relu:	PENDING
+
+Note that if your script requires GPUs, you should specify the number of GPUs
+required per trial in the ``resources`` section. Additionally, Ray should be
+initialized with the ``--num-gpus`` argument (you can also pass this argument
+to ``tune.py``).
+
+Using ray.tune as a library
+---------------------------
+
+Ray.tune can also be called programmatically from Python code. This allows for
+finer-grained control over trial setup and scheduling. Some examples of
+calling ray.tune programmatically include:
+
+- ``python/ray/tune/examples/tune_mnist_ray.py``
+- ``python/ray/rllib/train.py``
+
+Using ray.tune with Ray RLlib
+-----------------------------
+
+Another way to use ray.tune is through RLlib's ``python/ray/rllib/train.py``
+script. This script allows you to select between different RL algorithms with
+the ``--alg`` option. For example, to train Pong with the A3C algorithm, run:
+
+- ``./train.py --env=PongDeterministic-v4 --alg=A3C --num-trials=8 --stop '{"time_total_s": 3200}' --resources '{"cpu": 8}' --config '{"num_workers": 8}'``
+
+or
+
+- ``./train.py -f tuned_examples/pong-a3c.yaml``
+
+You can find more RLlib examples in ``python/ray/rllib/tuned_examples``.
+
+Specifying search parameters
+----------------------------
+
+To specify search parameters, variables in the ``config`` section may be set to
+different values for each trial. You can either specify ``grid_search: <list>``
 in place of a concrete value to specify a grid search across the list of
-values, or `eval: <str>` for values to be sampled from the given Python
+values, or ``eval: <str>`` for values to be sampled from the given Python
 expression.

 .. code:: yaml
@@ -40,15 +139,3 @@ expression.
                grid_search: [128, 256, 512]
            lr:
                eval: random.uniform(1e-4, 1e-3)
-
-See ray/rllib/tuned_examples for more examples of configs in YAML form.
-
-Using ray.tune to run custom scripts
------------------------------------
-
-TODO
-
-Using ray.tune as a library
---------------------------
-
-TODO
@@ -8,7 +8,6 @@ import json
 import numpy as np
 import os
 import random
-import sys

 from ray.tune.trial import Trial, Resources

@@ -32,13 +31,13 @@ def make_parser(description):
    parser.add_argument("--resources", default='{"cpu": 1}',
                        type=_resource_json,
                        help="Amount of resources to allocate per trial.")
-    parser.add_argument("--num_trials", default=1, type=int,
+    parser.add_argument("--num-trials", default=1, type=int,
                        help="Number of trials to evaluate.")
-    parser.add_argument("--local_dir", default="/tmp/ray", type=str,
+    parser.add_argument("--local-dir", default="/tmp/ray", type=str,
                        help="Local dir to save training results to.")
-    parser.add_argument("--upload_dir", default=None, type=str,
+    parser.add_argument("--upload-dir", default=None, type=str,
                        help="URI to upload training results to.")
-    parser.add_argument("--checkpoint_freq", default=sys.maxsize, type=int,
+    parser.add_argument("--checkpoint-freq", default=None, type=int,
                        help="How many iterations between checkpoints.")

    # TODO(ekl) environments are RL specific
@@ -73,7 +72,7 @@ def parse_to_trials(config):
    def to_argv(config):
        argv = []
        for k, v in config.items():
-            argv.append("--{}".format(k))
+            argv.append("--{}".format(k.replace("_", "-")))
            if type(v) is str:
                argv.append(v)
            else:
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+#
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A deep MNIST classifier using convolutional layers.
+
+See extensive documentation at
+https://www.tensorflow.org/get_started/mnist/pros
+"""
+# Disable linter warnings to maintain consistency with tutorial.
+# pylint: disable=invalid-name
+# pylint: disable=g-bad-import-order
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+import tempfile
+import os
+
+import ray
+from ray.tune.result import TrainingResult
+from ray.tune.trial import Trial
+from ray.tune.trial_runner import TrialRunner
+
+from tensorflow.examples.tutorials.mnist import input_data
+
+import tensorflow as tf
+
+FLAGS = None
+status_reporter = None  # used to report training status back to Ray
+activation_fn = None    # e.g. tf.nn.relu
+
+
+def deepnn(x):
+    """deepnn builds the graph for a deep net for classifying digits.
+
+    Args:
+        x: an input tensor with the dimensions (N_examples, 784), where 784 is
+        the number of pixels in a standard MNIST image.
+
+    Returns:
+        A tuple (y, keep_prob). y is a tensor of shape (N_examples, 10), with
+        values equal to the logits of classifying the digit into one of 10
+        classes (the digits 0-9). keep_prob is a scalar placeholder for the
+        probability of dropout.
+    """
+    # Reshape to use within a convolutional neural net.
+    # Last dimension is for "features" - there is only one here, since images
+    # are grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
+    with tf.name_scope('reshape'):
+        x_image = tf.reshape(x, [-1, 28, 28, 1])
+
+    # First convolutional layer - maps one grayscale image to 32 feature maps.
+    with tf.name_scope('conv1'):
+        W_conv1 = weight_variable([5, 5, 1, 32])
+        b_conv1 = bias_variable([32])
+        h_conv1 = activation_fn(conv2d(x_image, W_conv1) + b_conv1)
+
+    # Pooling layer - downsamples by 2X.
+    with tf.name_scope('pool1'):
+        h_pool1 = max_pool_2x2(h_conv1)
+
+    # Second convolutional layer -- maps 32 feature maps to 64.
+    with tf.name_scope('conv2'):
+        W_conv2 = weight_variable([5, 5, 32, 64])
+        b_conv2 = bias_variable([64])
+        h_conv2 = activation_fn(conv2d(h_pool1, W_conv2) + b_conv2)
+
+    # Second pooling layer.
+    with tf.name_scope('pool2'):
+        h_pool2 = max_pool_2x2(h_conv2)
+
+    # Fully connected layer 1 -- after 2 rounds of downsampling, our 28x28
+    # image is down to 7x7x64 feature maps -- maps this to 1024 features.
+    with tf.name_scope('fc1'):
+        W_fc1 = weight_variable([7 * 7 * 64, 1024])
+        b_fc1 = bias_variable([1024])
+
+        h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
+        h_fc1 = activation_fn(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
+
+    # Dropout - controls the complexity of the model, prevents co-adaptation of
+    # features.
+    with tf.name_scope('dropout'):
+        keep_prob = tf.placeholder(tf.float32)
+        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
+
+    # Map the 1024 features to 10 classes, one for each digit
+    with tf.name_scope('fc2'):
+        W_fc2 = weight_variable([1024, 10])
+        b_fc2 = bias_variable([10])
+
+        y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
+    return y_conv, keep_prob
+
+
+def conv2d(x, W):
+    """conv2d returns a 2d convolution layer with full stride."""
+    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
+
+
+def max_pool_2x2(x):
+    """max_pool_2x2 downsamples a feature map by 2X."""
+    return tf.nn.max_pool(
+        x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
+
+
+def weight_variable(shape):
+    """weight_variable generates a weight variable of a given shape."""
+    initial = tf.truncated_normal(shape, stddev=0.1)
+    return tf.Variable(initial)
+
+
+def bias_variable(shape):
+    """bias_variable generates a bias variable of a given shape."""
+    initial = tf.constant(0.1, shape=shape)
+    return tf.Variable(initial)
+
+
+def main(_):
+    # Import data
+    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+
+    # Create the model
+    x = tf.placeholder(tf.float32, [None, 784])
+
+    # Define loss and optimizer
+    y_ = tf.placeholder(tf.float32, [None, 10])
+
+    # Build the graph for the deep net
+    y_conv, keep_prob = deepnn(x)
+
+    with tf.name_scope('loss'):
+        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+            labels=y_, logits=y_conv)
+    cross_entropy = tf.reduce_mean(cross_entropy)
+
+    with tf.name_scope('adam_optimizer'):
+        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
+
+    with tf.name_scope('accuracy'):
+        correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
+        correct_prediction = tf.cast(correct_prediction, tf.float32)
+    accuracy = tf.reduce_mean(correct_prediction)
+
+    graph_location = tempfile.mkdtemp()
+    print('Saving graph to: %s' % graph_location)
+    train_writer = tf.summary.FileWriter(graph_location)
+    train_writer.add_graph(tf.get_default_graph())
+
+    with tf.Session() as sess:
+        sess.run(tf.global_variables_initializer())
+        for i in range(20000):
+            batch = mnist.train.next_batch(50)
+            if i % 10 == 0:
+                train_accuracy = accuracy.eval(feed_dict={
+                        x: batch[0], y_: batch[1], keep_prob: 1.0})
+
+                # !!! Report status to ray.tune !!!
+                if status_reporter:
+                    status_reporter.report(TrainingResult(
+                        timesteps_total=i,
+                        mean_accuracy=train_accuracy))
+
+                print('step %d, training accuracy %g' % (i, train_accuracy))
+            train_step.run(
+                feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
+
+        print('test accuracy %g' % accuracy.eval(feed_dict={
+                x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
+
+
+# !!! Entrypoint for ray.tune !!!
+def train(config={'activation': 'relu'}, reporter=None):
+    global FLAGS, status_reporter, activation_fn
+    status_reporter = reporter
+    activation_fn = getattr(tf.nn, config['activation'])
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
+        help='Directory for storing input data')
+    FLAGS, unparsed = parser.parse_known_args()
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+
+
+# !!! Example of using the ray.tune Python API !!!
+if __name__ == '__main__':
+    runner = TrialRunner()
+
+    for act in ['relu', 'elu', 'tanh']:
+        runner.add_trial(
+            Trial(
+                'mnist', 'script',
+                stopping_criterion={
+                    'mean_accuracy': 0.99, 'time_total_s': 600},
+                config={
+                    'script_file_path': os.path.abspath(__file__),
+                    'script_min_iter_time_s': 1,
+                    'activation': act,
+                },
+                agent_id='act={}'.format(act)))
+
+    ray.init()
+
+    while not runner.is_finished():
+        runner.step()
+        print(runner.debug_string())
@@ -0,0 +1,15 @@
+tune_mnist:
+    env: mnist
+    alg: script
+    num_trials: 10
+    resources:
+        cpu: 1
+    stop:
+        mean_accuracy: 0.99
+        time_total_s: 600
+    config:
+        script_file_path: examples/tune_mnist_ray.py
+        script_entrypoint: train
+        script_min_iter_time_s: 1
+        activation:
+            grid_search: ['relu', 'elu', 'tanh']
@@ -0,0 +1,62 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import namedtuple
+
+"""
+When using ray.tune with custom training scripts, you must periodically report
+training status back to Ray by calling status_reporter.report(result).
+
+Most of the fields are optional, the only required one is timesteps_total.
+
+In RLlib, the supplied algorithms fill in TrainingResult for you.
+"""
+
+
+TrainingResult = namedtuple("TrainingResult", [
+    # (Required) Accumulated timesteps for this entire experiment.
+    "timesteps_total",
+
+    # (Optional) Custom metadata to report for this iteration.
+    "info",
+
+    # (Optional) The mean episode reward if applicable.
+    "episode_reward_mean",
+
+    # (Optional) The mean episode length if applicable.
+    "episode_len_mean",
+
+    # (Optional) The current training accuracy if applicable.
+    "mean_accuracy",
+
+    # (Optional) The current training loss if applicable.
+    "mean_loss",
+
+    # (Auto-filled) The negated current training loss.
+    "neg_mean_loss",
+
+    # (Auto-filled) Unique string identifier for this experiment. This id is
+    # preserved across checkpoint / restore calls.
+    "experiment_id",
+
+    # (Auto-filled) The index of this training iteration, e.g. call to train().
+    "training_iteration",
+
+    # (Auto-filled) Number of timesteps in the simulator in this iteration.
+    "timesteps_this_iter",
+
+    # (Auto-filled) Time in seconds this iteration took to run.
+    "time_this_iter_s",
+
+    # (Auto-filled) Accumulated time in seconds for this entire experiment.
+    "time_total_s",
+
+    # (Auto-filled) The pid of the training process.
+    "pid",
+
+    # (Auto-filled) The hostname of the machine hosting the training process.
+    "hostname",
+])
+
+# Default every field to None so a TrainingResult can be constructed with
+# only the fields the caller wants to set.
+TrainingResult.__new__.__defaults__ = (None,) * len(TrainingResult._fields)
@@ -0,0 +1,162 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+import os
+import sys
+import time
+import threading
+
+from ray.rllib.agent import Agent
+
+
+class StatusReporter(object):
+    """Object passed into your main() that you can report status through.
+
+    Only the most recent result is kept: a new report() call overwrites any
+    result that has not been collected yet.
+    """
+
+    def __init__(self):
+        # Latest uncollected result, or None; accessed under self._lock.
+        self._latest_result = None
+        self._lock = threading.Lock()
+        # Error object or string once a failure or stop has been recorded.
+        # Note that it is read and written without taking self._lock.
+        self._error = None
+
+    def report(self, result):
+        """Report updated training status.
+
+        Args:
+            result (TrainingResult): Latest training result status. You must
+                at least define `timesteps_total`, but probably want to report
+                some of the other metrics as well.
+        """
+
+        with self._lock:
+            self._latest_result = result
+
+    def set_error(self, error):
+        """Report an error.
+
+        The error is never cleared by this class, so once it is set every
+        later status poll raises.
+
+        Args:
+            error (obj): Error object or string.
+        """
+
+        self._error = error
+
+    def _get_and_clear_status(self):
+        """Return the latest uncollected result and reset it to None.
+
+        Returns:
+            The result passed to the most recent report() call, or None if
+            nothing has been reported since the previous poll.
+
+        Raises:
+            Exception: If an error was recorded via set_error() or _stop().
+        """
+        if self._error:
+            raise Exception("Error running script: " + str(self._error))
+        with self._lock:
+            res = self._latest_result
+            self._latest_result = None
+            return res
+
+    def _stop(self):
+        """Record "Agent stopped" as the error so later status polls raise."""
+        self._error = "Agent stopped"
+
+
+# Defaults for the "script" agent (used as ScriptRunner._default_config).
+DEFAULT_CONFIG = {
+    # Path of the script to run ("~" is expanded by ScriptRunner._init).
+    "script_file_path": "/path/to/file.py",
+
+    # Name of the train function in the file, e.g.
+    # train(config, status_reporter).
+    "script_entrypoint": "train",
+
+    # Batch results to at least this granularity: ScriptRunner.train() keeps
+    # polling for at least this many seconds before returning a result.
+    "script_min_iter_time_s": 5,
+}
+
+
+class _RunnerThread(threading.Thread):
+    """Supervisor thread that runs your script.
+
+    Calls entrypoint(config, status_reporter) and, if that raises, records
+    the exception on the status reporter before re-raising it.
+    """
+
+    def __init__(self, entrypoint, config, status_reporter):
+        """Store the entrypoint and its arguments; the thread is not started.
+
+        Args:
+            entrypoint (callable): Function invoked as
+                entrypoint(config, status_reporter).
+            config: Configuration passed through to the entrypoint.
+            status_reporter: Reporter passed to the entrypoint; also used to
+                record an exception raised by the entrypoint.
+        """
+        self._entrypoint = entrypoint
+        self._entrypoint_args = [config, status_reporter]
+        self._status_reporter = status_reporter
+        threading.Thread.__init__(self)
+        # Daemon thread: it does not keep the process alive on exit.
+        self.daemon = True
+
+    def run(self):
+        """Run the entrypoint, forwarding any exception to the reporter."""
+        try:
+            self._entrypoint(*self._entrypoint_args)
+        except Exception as e:
+            # Record the failure so the polling side sees it, then re-raise
+            # so the exception is not silently swallowed by this thread.
+            self._status_reporter.set_error(e)
+            raise e
+
+
+class ScriptRunner(Agent):
+    """Agent that runs a user script returning training results.
+
+    The file named by config["script_file_path"] is imported and the
+    function named by config["script_entrypoint"] is run on a background
+    thread with (config, status_reporter) as arguments. train() polls the
+    status reporter and fills in the auto-computed TrainingResult fields.
+    """
+
+    _agent_name = "script"
+    _default_config = DEFAULT_CONFIG
+    _allow_unknown_configs = True
+
+    def _init(self):
+        """Import the user script and start its entrypoint on a thread.
+
+        Raises:
+            Exception: If the imported module object is falsy.
+            AttributeError: If the module has no attribute named by
+                config["script_entrypoint"].
+        """
+        # strong assumption here that we're in a new process
+        file_path = os.path.expanduser(self.config["script_file_path"])
+        # Put the script's directory first on sys.path so that modules next
+        # to the script can be imported by it.
+        sys.path.insert(0, os.path.dirname(file_path))
+        if hasattr(importlib, "util"):
+            # Python 3.4+
+            spec = importlib.util.spec_from_file_location(
+                "external_file", file_path)
+            external_file = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(external_file)
+        elif hasattr(importlib, "machinery"):
+            # Python 3.3
+            from importlib.machinery import SourceFileLoader
+            external_file = SourceFileLoader(
+                "external_file", file_path).load_module()
+        else:
+            # Python 2.x
+            import imp
+            external_file = imp.load_source("external_file", file_path)
+        if not external_file:
+            raise Exception(
+                "Unable to import file at {}".format(
+                    self.config["script_file_path"]))
+        entrypoint = getattr(external_file, self.config["script_entrypoint"])
+        self._status_reporter = StatusReporter()
+        self._runner = _RunnerThread(
+            entrypoint, self.config, self._status_reporter)
+        self._start_time = time.time()
+        self._last_reported_time = self._start_time
+        self._last_reported_timestep = 0
+        self._runner.start()
+
+    def train(self):
+        """Wait for the script to report a result and return it.
+
+        Polls the status reporter once per second for at least
+        config["script_min_iter_time_s"] seconds and returns the most
+        recent result reported during that time, with the auto-filled
+        fields set.
+
+        Returns:
+            TrainingResult: The latest reported result, completed with
+                experiment id, iteration index, timing, pid and hostname.
+
+        Raises:
+            Exception: If the script raised, the agent was stopped, or the
+                script thread exited without reporting a new result.
+        """
+        min_iter_time_s = self.config["script_min_iter_time_s"]
+        poll_start = time.time()
+        result = self._status_reporter._get_and_clear_status()
+        while result is None or \
+                time.time() - poll_start < min_iter_time_s:
+            time.sleep(1)
+            # Sample liveness *before* polling: if the thread was already
+            # dead, anything it reported is visible to the poll below.
+            script_exited = not self._runner.is_alive()
+            latest = self._status_reporter._get_and_clear_status()
+            if latest is not None:
+                # Keep the newest result; an empty poll must not discard a
+                # result collected earlier in this call.
+                result = latest
+            elif result is None and script_exited:
+                # Nothing was reported and nothing ever will be; fail
+                # instead of polling forever.
+                raise Exception(
+                    "Script exited without reporting a new result")
+
+        now = time.time()
+
+        # Include the negative loss to use as a stopping condition
+        if result.mean_loss is not None:
+            neg_loss = -result.mean_loss
+        else:
+            neg_loss = result.neg_mean_loss
+
+        # NOTE(review): timesteps_total is documented as required; if the
+        # script leaves it as None the subtraction below raises TypeError.
+        result = result._replace(
+            experiment_id=self._experiment_id,
+            neg_mean_loss=neg_loss,
+            training_iteration=self.iteration,
+            time_this_iter_s=now - self._last_reported_time,
+            timesteps_this_iter=(
+                result.timesteps_total - self._last_reported_timestep),
+            time_total_s=now - self._start_time,
+            pid=os.getpid(),
+            hostname=os.uname()[1])
+
+        if result.timesteps_total:
+            self._last_reported_timestep = result.timesteps_total
+        self._last_reported_time = now
+        self._iteration += 1
+        self._log_result(result)
+
+        return result
+
+    def stop(self):
+        """Mark the status reporter as stopped, then stop the agent."""
+        self._status_reporter._stop()
+        Agent.stop(self)
@@ -2,12 +2,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import sys
 import traceback
 import ray
+import os

 from collections import namedtuple
-from ray.rllib.agents import get_agent_class
+from ray.rllib.agent import get_agent_class


 # Ray resources required to schedule a Trial
@@ -24,15 +24,15 @@ class Trial(object):
    On error it transitions to ERROR, otherwise TERMINATED on success.
    """

-    PENDING = 'PENDING'
-    RUNNING = 'RUNNING'
-    TERMINATED = 'TERMINATED'
-    ERROR = 'ERROR'
+    PENDING = "PENDING"
+    RUNNING = "RUNNING"
+    TERMINATED = "TERMINATED"
+    ERROR = "ERROR"

    def __init__(
            self, env_creator, alg, config={}, local_dir='/tmp/ray',
            agent_id=None, resources=Resources(cpu=1, gpu=0),
-            stopping_criterion={}, checkpoint_freq=sys.maxsize,
+            stopping_criterion={}, checkpoint_freq=None,
            restore_path=None, upload_dir=None):
        """Initialize a new trial.

@@ -61,6 +61,7 @@ class Trial(object):
        self.checkpoint_path = None
        self.agent = None
        self.status = Trial.PENDING
+        self.location = None

    def start(self):
        """Starts this trial.
@@ -135,12 +136,33 @@ class Trial(object):

        if self.last_result is None:
            return self.status
-        return '{}, {} s, {} ts, {} itrs, {} rew'.format(
-            self.status,
-            int(self.last_result.time_total_s),
-            int(self.last_result.timesteps_total),
-            self.last_result.training_iteration,
-            round(self.last_result.episode_reward_mean, 1))
+
+        def location_string(hostname, pid):
+            if hostname == os.uname()[1]:
+                return 'pid={}'.format(pid)
+            else:
+                return '{} pid={}'.format(hostname, pid)
+
+        pieces = [
+            '{} [{}]'.format(
+                self.status, location_string(
+                    self.last_result.hostname, self.last_result.pid)),
+            '{} s'.format(int(self.last_result.time_total_s)),
+            '{} ts'.format(int(self.last_result.timesteps_total))]
+
+        if self.last_result.episode_reward_mean is not None:
+            pieces.append('{} rew'.format(
+                format(self.last_result.episode_reward_mean, '.3g')))
+
+        if self.last_result.mean_loss is not None:
+            pieces.append('{} loss'.format(
+                format(self.last_result.mean_loss, '.3g')))
+
+        if self.last_result.mean_accuracy is not None:
+            pieces.append('{} acc'.format(
+                format(self.last_result.mean_accuracy, '.3g')))
+
+        return ', '.join(pieces)

    def checkpoint(self):
        """Synchronously checkpoints the state of this trial.
@@ -63,7 +63,7 @@ class TrialRunner(object):
                if trial.status == Trial.PENDING:
                    assert self._has_resources(trial.resources), \
                        ("Insufficient cluster resources to launch trial",
-                         trial.resources)
+                         (trial.resources, self._avail_resources))
            assert False, "Called step when all trials finished?"

    def get_trials(self):
@@ -87,9 +87,11 @@ class TrialRunner(object):

        messages = ["== Status =="]
        messages.append(
-            "Available: {}".format(self._avail_resources))
-        messages.append(
-            "Committed: {}".format(self._committed_resources))
+            "Resources used: {}/{} CPUs, {}/{} GPUs".format(
+                self._committed_resources.cpu,
+                self._avail_resources.cpu,
+                self._committed_resources.gpu,
+                self._avail_resources.gpu))
        for local_dir in sorted(set([t.local_dir for t in self._trials])):
            messages.append("Tensorboard logdir: {}".format(local_dir))
            for t in self._trials:
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+
+"""Command-line tool for tuning hyperparameters with Ray.
+
+MNIST tuning example:
+    ./tune.py -f examples/tune_mnist_ray.yaml
+"""
+
+from ray.rllib import train
+import sys
+
+# TODO(ekl) right now this is a thin wrapper around the rllib training script,
+# however in the future we should have a separate command line tool here.
+# Forward the caller's command-line arguments and append --alg=script,
+# presumably selecting the "script" agent (confirm against train.main).
+train.main(sys.argv[1:] + ['--alg=script'])