diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index 4f8a4c66a..a5c10d440 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -24,18 +24,21 @@ ARS **Yes** **Yes** No No .. _`+parametric`: rllib-models.html#variable-length-parametric-action-spaces -You can pass either a string name or a Python class to specify an environment. By default, strings will be interpreted as a gym `environment name <https://gym.openai.com/envs>`__. Custom env classes must take a single ``env_config`` parameter in their constructor: +You can pass either a string name or a Python class to specify an environment. By default, strings will be interpreted as a gym `environment name <https://gym.openai.com/envs>`__. Custom env classes passed directly to the agent must take a single ``env_config`` parameter in their constructor: .. code-block:: python - import ray + import gym, ray from ray.rllib.agents import ppo class MyEnv(gym.Env): def __init__(self, env_config): - self.action_space = ... - self.observation_space = ... - ... + self.action_space = <gym.Space> + self.observation_space = <gym.Space> + def reset(self): + return <obs> + def step(self, action): + return <obs>, <reward: float>, <done: bool>, <info: dict> ray.init() trainer = ppo.PPOAgent(env=MyEnv, config={ diff --git a/python/ray/rllib/evaluation/sampler.py b/python/ray/rllib/evaluation/sampler.py index ac7c6ed8a..459d19296 100644 --- a/python/ray/rllib/evaluation/sampler.py +++ b/python/ray/rllib/evaluation/sampler.py @@ -287,12 +287,12 @@ def _env_runner(async_vector_env, # Do batched policy eval eval_results = _do_policy_eval(tf_sess, to_eval, policies, - active_episodes, clip_actions) + active_episodes) # Process results and update episode state actions_to_send = _process_policy_eval_results( to_eval, eval_results, active_episodes, active_envs, - off_policy_actions) + off_policy_actions, policies, clip_actions) # Return computed actions to ready envs. We also send to envs that have # taken off-policy actions; those envs are free to ignore the action. 
@@ -448,7 +448,7 @@ def _process_observations(async_vector_env, policies, batch_builder_pool, return active_envs, to_eval, outputs -def _do_policy_eval(tf_sess, to_eval, policies, active_episodes, clip_actions): +def _do_policy_eval(tf_sess, to_eval, policies, active_episodes): """Call compute actions on observation batches to get next actions. Returns: @@ -483,18 +483,12 @@ def _do_policy_eval(tf_sess, to_eval, policies, active_episodes, clip_actions): for k, v in pending_fetches.items(): eval_results[k] = builder.get(v) - if clip_actions: - for policy_id, results in eval_results.items(): - policy = _get_or_raise(policies, policy_id) - actions, rnn_out_cols, pi_info_cols = results - eval_results[policy_id] = (_clip_actions( - actions, policy.action_space), rnn_out_cols, pi_info_cols) - return eval_results def _process_policy_eval_results(to_eval, eval_results, active_episodes, - active_envs, off_policy_actions): + active_envs, off_policy_actions, policies, + clip_actions): """Process the output of policy neural network evaluation. Records policy evaluation results into the given episode objects and @@ -521,10 +515,15 @@ def _process_policy_eval_results(to_eval, eval_results, active_episodes, pi_info_cols["state_out_{}".format(f_i)] = column # Save output rows actions = _unbatch_tuple_actions(actions) + policy = _get_or_raise(policies, policy_id) for i, action in enumerate(actions): env_id = eval_data[i].env_id agent_id = eval_data[i].agent_id - actions_to_send[env_id][agent_id] = action + if clip_actions: + actions_to_send[env_id][agent_id] = _clip_actions( + action, policy.action_space) + else: + actions_to_send[env_id][agent_id] = action episode = active_episodes[env_id] episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols]) episode._set_last_pi_info( @@ -562,7 +561,7 @@ def _clip_actions(actions, space): """Called to clip actions to the specified range of this policy. Arguments: - actions: Batch of actions or TupleActions. + actions: Single action. 
space: Action space the actions should be present in. Returns: @@ -572,13 +571,13 @@ def _clip_actions(actions, space): if isinstance(space, gym.spaces.Box): return np.clip(actions, space.low, space.high) elif isinstance(space, gym.spaces.Tuple): - if not isinstance(actions, TupleActions): + if type(actions) not in (tuple, list): raise ValueError("Expected tuple space for actions {}: {}".format( actions, space)) out = [] - for a, s in zip(actions.batches, space.spaces): + for a, s in zip(actions, space.spaces): out.append(_clip_actions(a, s)) - return TupleActions(out) + return out else: return actions diff --git a/python/ray/rllib/test/multiagent_pendulum.py b/python/ray/rllib/test/multiagent_pendulum.py new file mode 100644 index 000000000..c4ee5ce76 --- /dev/null +++ b/python/ray/rllib/test/multiagent_pendulum.py @@ -0,0 +1,42 @@ +"""Integration test: (1) pendulum works, (2) single-agent multi-agent works.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import ray +from ray.rllib.test.test_multi_agent_env import make_multiagent +from ray.tune import run_experiments +from ray.tune.registry import register_env + +if __name__ == "__main__": + ray.init() + MultiPendulum = make_multiagent("Pendulum-v0") + register_env("multi_pend", lambda _: MultiPendulum(1)) + trials = run_experiments({ + "test": { + "run": "PPO", + "env": "multi_pend", + "stop": { + "timesteps_total": 500000, + "episode_reward_mean": -200, + }, + "config": { + "train_batch_size": 2048, + "vf_clip_param": 10.0, + "num_workers": 0, + "num_envs_per_worker": 10, + "lambda": 0.1, + "gamma": 0.95, + "lr": 0.0003, + "sgd_minibatch_size": 64, + "num_sgd_iter": 10, + "model": { + "fcnet_hiddens": [64, 64], + }, + "batch_mode": "complete_episodes", + }, + } + }) + if trials[0].last_result["episode_reward_mean"] < -200: + raise ValueError("Did not get to -200 reward", trials[0].last_result) diff --git 
a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml index b8c0293a3..3e9d45179 100644 --- a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml @@ -5,7 +5,8 @@ pendulum-ppo: config: train_batch_size: 2048 vf_clip_param: 10.0 - num_workers: 2 + num_workers: 0 + num_envs_per_worker: 10 lambda: 0.1 gamma: 0.95 lr: 0.0003 diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh index 9fba8a000..908bc438e 100755 --- a/test/jenkins_tests/run_multi_node_tests.sh +++ b/test/jenkins_tests/run_multi_node_tests.sh @@ -299,6 +299,14 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/test/test_rollout.sh +# Try a couple times since it's stochastic +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/rllib/test/multiagent_pendulum.py || \ + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/rllib/test/multiagent_pendulum.py || \ + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/rllib/test/multiagent_pendulum.py + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/tune_mnist_ray.py \ --smoke-test