From d01dc9e22d5e8625ae6ac49e2e689eebf472b5f8 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 19 Jul 2018 15:30:36 -0700 Subject: [PATCH] [rllib] format with yapf (#2427) * initial yapf * manual fix yapf bugs --- .travis/yapf.sh | 1 - python/ray/rllib/__init__.py | 17 ++- python/ray/rllib/agents/a3c/a3c.py | 10 +- .../rllib/agents/a3c/a3c_tf_policy_graph.py | 44 +++--- .../agents/a3c/a3c_torch_policy_graph.py | 21 +-- python/ray/rllib/agents/agent.py | 61 +++++---- python/ray/rllib/agents/bc/bc.py | 21 +-- python/ray/rllib/agents/bc/bc_evaluator.py | 13 +- .../ray/rllib/agents/bc/experience_dataset.py | 5 +- python/ray/rllib/agents/bc/policy.py | 37 ++--- python/ray/rllib/agents/ddpg/apex.py | 13 +- python/ray/rllib/agents/ddpg/ddpg.py | 6 +- .../rllib/agents/ddpg/ddpg_policy_graph.py | 79 ++++++----- python/ray/rllib/agents/dqn/apex.py | 13 +- python/ray/rllib/agents/dqn/dqn.py | 48 +++---- .../ray/rllib/agents/dqn/dqn_policy_graph.py | 112 ++++++++------- python/ray/rllib/agents/es/es.py | 68 +++++---- python/ray/rllib/agents/es/optimizers.py | 4 +- python/ray/rllib/agents/es/policies.py | 26 ++-- python/ray/rllib/agents/es/tabular_logger.py | 16 ++- python/ray/rllib/agents/es/utils.py | 14 +- python/ray/rllib/agents/pg/pg.py | 7 +- python/ray/rllib/agents/pg/pg_policy_graph.py | 12 +- python/ray/rllib/agents/ppo/ppo.py | 31 +++-- .../ray/rllib/agents/ppo/ppo_policy_graph.py | 105 +++++++++----- python/ray/rllib/agents/ppo/test/test.py | 8 +- python/ray/rllib/agents/ppo/utils.py | 2 +- python/ray/rllib/env/async_vector_env.py | 15 +- python/ray/rllib/env/atari_wrappers.py | 5 +- python/ray/rllib/env/vector_env.py | 3 +- python/ray/rllib/evaluation/metrics.py | 7 +- .../ray/rllib/evaluation/policy_evaluator.py | 98 +++++++------ python/ray/rllib/evaluation/sample_batch.py | 7 +- python/ray/rllib/evaluation/sampler.py | 80 ++++++----- .../ray/rllib/evaluation/tf_policy_graph.py | 59 ++++---- .../rllib/evaluation/torch_policy_graph.py | 8 +- .../multiagent_mountaincar.py | 23 ++-- .../multiagent_mountaincar_env.py | 5 +- .../legacy_multiagent/multiagent_pendulum.py | 23 ++-- .../multiagent_pendulum_env.py | 30 ++-- .../ray/rllib/examples/multiagent_cartpole.py | 6 +- .../rllib/examples/serving/cartpole_client.py | 10 +- .../rllib/examples/serving/cartpole_server.py | 27 ++-- python/ray/rllib/models/__init__.py | 8 +- python/ray/rllib/models/action_dist.py | 56 ++++---- python/ray/rllib/models/catalog.py | 71 +++++----- python/ray/rllib/models/fcnet.py | 9 +- python/ray/rllib/models/lstm.py | 27 ++-- python/ray/rllib/models/misc.py | 41 ++++-- python/ray/rllib/models/model.py | 14 +- python/ray/rllib/models/multiagentfcnet.py | 6 +- python/ray/rllib/models/preprocessors.py | 11 +- python/ray/rllib/models/visionnet.py | 25 +++- python/ray/rllib/optimizers/__init__.py | 1 - .../optimizers/async_gradients_optimizer.py | 12 +- .../optimizers/async_samples_optimizer.py | 94 +++++++------ python/ray/rllib/optimizers/multi_gpu_impl.py | 41 +++--- .../rllib/optimizers/multi_gpu_optimizer.py | 42 +++--- .../ray/rllib/optimizers/policy_optimizer.py | 7 +- python/ray/rllib/optimizers/replay_buffer.py | 24 ++-- python/ray/rllib/optimizers/segment_tree.py | 16 +-- .../rllib/optimizers/sync_replay_optimizer.py | 74 ++++++---- .../optimizers/sync_samples_optimizer.py | 17 ++- python/ray/rllib/rollout.py | 28 ++-- python/ray/rllib/scripts.py | 1 - python/ray/rllib/test/mock_evaluator.py | 9 +- python/ray/rllib/test/test_catalog.py | 10 +- .../ray/rllib/test/test_checkpoint_restore.py | 22 ++- python/ray/rllib/test/test_filters.py | 16 ++- python/ray/rllib/test/test_lstm.py | 21 +-- python/ray/rllib/test/test_multi_agent_env.py | 129 ++++++++++++------ python/ray/rllib/test/test_optimizers.py | 4 +- .../ray/rllib/test/test_policy_evaluator.py | 23 ++-- python/ray/rllib/test/test_serving_env.py | 14 +- .../ray/rllib/test/test_supported_spaces.py | 67 ++++----- python/ray/rllib/train.py | 35 +++-- .../generate_regression_tests.py | 2 - .../regression_tests/regression_test.py | 1 - .../tuned_examples/run_regression_tests.py | 4 +- python/ray/rllib/utils/compression.py | 7 +- python/ray/rllib/utils/filter.py | 13 +- python/ray/rllib/utils/policy_server.py | 12 +- python/ray/rllib/utils/reshaper.py | 15 +- python/ray/rllib/utils/schedules.py | 8 +- python/ray/rllib/utils/tf_run_builder.py | 13 +- python/ray/rllib/utils/window_stat.py | 4 +- 86 files changed, 1276 insertions(+), 978 deletions(-) diff --git a/.travis/yapf.sh b/.travis/yapf.sh index 7c12ce4b4..75fed3efe 100755 --- a/.travis/yapf.sh +++ b/.travis/yapf.sh @@ -24,7 +24,6 @@ YAPF_FLAGS=( ) YAPF_EXCLUDES=( - '--exclude' 'python/ray/rllib/*' '--exclude' 'python/ray/cloudpickle/*' '--exclude' 'python/build/*' '--exclude' 'python/ray/pyarrow_files/*' diff --git a/python/ray/rllib/__init__.py b/python/ray/rllib/__init__.py index 609acdd0f..cf0f10580 100644 --- a/python/ray/rllib/__init__.py +++ b/python/ray/rllib/__init__.py @@ -17,9 +17,10 @@ from ray.rllib.evaluation.sample_batch import SampleBatch def _register_all(): - for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG", - "APEX_DDPG", "__fake", "__sigmoid_fake_data", - "__parameter_tuning"]: + for key in [ + "PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG", "APEX_DDPG", + "__fake", "__sigmoid_fake_data", "__parameter_tuning" + ]: from ray.rllib.agents.agent import get_agent_class register_trainable(key, get_agent_class(key)) @@ -27,6 +28,12 @@ def _register_all(): _register_all() __all__ = [ - "PolicyGraph", "TFPolicyGraph", "PolicyEvaluator", "SampleBatch", - "AsyncVectorEnv", "MultiAgentEnv", "VectorEnv", "ServingEnv", + "PolicyGraph", + "TFPolicyGraph", + "PolicyEvaluator", + "SampleBatch", + "AsyncVectorEnv", + "MultiAgentEnv", + "VectorEnv", + "ServingEnv", ] diff --git a/python/ray/rllib/agents/a3c/a3c.py b/python/ray/rllib/agents/a3c/a3c.py index 7326685aa..0a739474f 100644 --- a/python/ray/rllib/agents/a3c/a3c.py +++ b/python/ray/rllib/agents/a3c/a3c.py @@ -92,15 +92,15 @@ class A3CAgent(Agent): self.remote_evaluators = self.make_remote_evaluators( self.env_creator, policy_cls, self.config["num_workers"], {"num_gpus": 1 if self.config["use_gpu_for_workers"] else 0}) - self.optimizer = AsyncGradientsOptimizer( - self.local_evaluator, self.remote_evaluators, - self.config["optimizer"]) + self.optimizer = AsyncGradientsOptimizer(self.local_evaluator, + self.remote_evaluators, + self.config["optimizer"]) def _train(self): prev_steps = self.optimizer.num_steps_sampled self.optimizer.step() - FilterManager.synchronize( - self.local_evaluator.filters, self.remote_evaluators) + FilterManager.synchronize(self.local_evaluator.filters, + self.remote_evaluators) result = self.optimizer.collect_metrics() result = result._replace( timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps) diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py index faf22f602..00f630d3b 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py @@ -14,19 +14,23 @@ from ray.rllib.models.catalog import ModelCatalog class A3CLoss(object): - def __init__( - self, action_dist, actions, advantages, v_target, vf, - vf_loss_coeff=0.5, entropy_coeff=-0.01): + def __init__(self, + action_dist, + actions, + advantages, + v_target, + vf, + vf_loss_coeff=0.5, + entropy_coeff=-0.01): log_prob = action_dist.logp(actions) # The "policy gradients" loss - self.pi_loss = - tf.reduce_sum(log_prob * advantages) + self.pi_loss = -tf.reduce_sum(log_prob * advantages) delta = vf - v_target self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta)) self.entropy = tf.reduce_sum(action_dist.entropy()) - self.total_loss = (self.pi_loss + - self.vf_loss * vf_loss_coeff + + self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff + self.entropy * entropy_coeff) @@ -41,8 +45,8 @@ class A3CPolicyGraph(TFPolicyGraph): tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) - self.model = ModelCatalog.get_model( - self.observations, logit_dim, self.config["model"]) + self.model = ModelCatalog.get_model(self.observations, logit_dim, + self.config["model"]) action_dist = dist_class(self.model.outputs) self.vf = tf.reshape( linear(self.model.last_layer, 1, "value", normc_initializer(1.0)), @@ -62,9 +66,9 @@ class A3CPolicyGraph(TFPolicyGraph): action_space)) advantages = tf.placeholder(tf.float32, [None], name="advantages") v_target = tf.placeholder(tf.float32, [None], name="v_target") - self.loss = A3CLoss( - action_dist, actions, advantages, v_target, self.vf, - self.config["vf_loss_coeff"], self.config["entropy_coeff"]) + self.loss = A3CLoss(action_dist, actions, advantages, v_target, + self.vf, self.config["vf_loss_coeff"], + self.config["entropy_coeff"]) # Initialize TFPolicyGraph loss_in = [ @@ -76,10 +80,16 @@ class A3CPolicyGraph(TFPolicyGraph): self.state_in = self.model.state_in self.state_out = self.model.state_out TFPolicyGraph.__init__( - self, observation_space, action_space, self.sess, - obs_input=self.observations, action_sampler=action_dist.sample(), - loss=self.loss.total_loss, loss_inputs=loss_in, - state_inputs=self.state_in, state_outputs=self.state_out, + self, + observation_space, + action_space, + self.sess, + obs_input=self.observations, + action_sampler=action_dist.sample(), + loss=self.loss.total_loss, + loss_inputs=loss_in, + state_inputs=self.state_in, + state_outputs=self.state_out, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) @@ -132,5 +142,5 @@ class A3CPolicyGraph(TFPolicyGraph): for i in range(len(self.state_in)): next_state.append([sample_batch["state_out_{}".format(i)][-1]]) last_r = self.value(sample_batch["new_obs"][-1], *next_state) - return compute_advantages( - sample_batch, last_r, self.config["gamma"], self.config["lambda"]) + return compute_advantages(sample_batch, last_r, self.config["gamma"], + self.config["lambda"]) diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index a277de945..dcdada591 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -46,20 +46,21 @@ class A3CTorchPolicyGraph(TorchPolicyGraph): action_space, self.config["model"]) self.model = ModelCatalog.get_torch_model( obs_space.shape, self.logit_dim, self.config["model"]) - loss = A3CLoss( - self.model, self.config["vf_loss_coeff"], - self.config["entropy_coeff"]) + loss = A3CLoss(self.model, self.config["vf_loss_coeff"], + self.config["entropy_coeff"]) TorchPolicyGraph.__init__( - self, obs_space, action_space, self.model, loss, - loss_inputs=[ - "obs", "actions", "advantages", "value_targets"]) + self, + obs_space, + action_space, + self.model, + loss, + loss_inputs=["obs", "actions", "advantages", "value_targets"]) def extra_action_out(self, model_out): return {"vf_preds": var_to_np(model_out[1])} def optimizer(self): - return torch.optim.Adam( - self.model.parameters(), lr=self.config["lr"]) + return torch.optim.Adam(self.model.parameters(), lr=self.config["lr"]) def postprocess_trajectory(self, sample_batch, other_agent_batches=None): completed = sample_batch["dones"][-1] @@ -67,8 +68,8 @@ class A3CTorchPolicyGraph(TorchPolicyGraph): last_r = 0.0 else: last_r = self._value(sample_batch["new_obs"][-1]) - return compute_advantages( - sample_batch, last_r, self.config["gamma"], self.config["lambda"]) + return compute_advantages(sample_batch, last_r, self.config["gamma"], + self.config["lambda"]) def _value(self, obs): with self.lock: diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py index f53923030..9cd661f3a 100644 --- a/python/ray/rllib/agents/agent.py +++ b/python/ray/rllib/agents/agent.py @@ -47,7 +47,9 @@ COMMON_CONFIG = { "allow_growth": True, }, "log_device_placement": False, - "device_count": {"CPU": 1}, + "device_count": { + "CPU": 1 + }, "allow_soft_placement": True, # required by PPO multi-gpu }, # Whether to LZ4 compress observations @@ -86,8 +88,7 @@ def _deep_update(original, new_dict, new_keys_allowed, whitelist): for k, value in new_dict.items(): if k not in original and k != "env": if not new_keys_allowed: - raise Exception( - "Unknown config parameter `{}` ".format(k)) + raise Exception("Unknown config parameter `{}` ".format(k)) if type(original.get(k)) is dict: if k in whitelist: _deep_update(original[k], value, True, []) @@ -112,22 +113,24 @@ class Agent(Trainable): _allow_unknown_configs = False _allow_unknown_subkeys = [ - "tf_session_args", "env_config", "model", "optimizer", "multiagent"] + "tf_session_args", "env_config", "model", "optimizer", "multiagent" + ] def make_local_evaluator(self, env_creator, policy_graph): """Convenience method to return configured local evaluator.""" - return self._make_evaluator( - PolicyEvaluator, env_creator, policy_graph, 0) + return self._make_evaluator(PolicyEvaluator, env_creator, policy_graph, + 0) - def make_remote_evaluators( - self, env_creator, policy_graph, count, remote_args): + def make_remote_evaluators(self, env_creator, policy_graph, count, + remote_args): """Convenience method to return a number of remote evaluators.""" cls = PolicyEvaluator.as_remote(**remote_args).remote return [ - self._make_evaluator(cls, env_creator, policy_graph, i+1) - for i in range(count)] + self._make_evaluator(cls, env_creator, policy_graph, i + 1) + for i in range(count) + ] def _make_evaluator(self, cls, env_creator, policy_graph, worker_index): config = self.config @@ -140,8 +143,8 @@ class Agent(Trainable): env_creator, self.config["multiagent"]["policy_graphs"] or policy_graph, policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"], - tf_session_creator=( - session_creator if config["tf_session_args"] else None), + tf_session_creator=(session_creator + if config["tf_session_args"] else None), batch_steps=config["sample_batch_size"], batch_mode=config["batch_mode"], episode_horizon=config["horizon"], @@ -157,14 +160,12 @@ class Agent(Trainable): @classmethod def resource_help(cls, config): - return ( - "\n\nYou can adjust the resource requests of RLlib agents by " - "setting `num_workers` and other configs. See the " - "DEFAULT_CONFIG defined by each agent for more info.\n\n" - "The config of this agent is: " + json.dumps(config)) + return ("\n\nYou can adjust the resource requests of RLlib agents by " + "setting `num_workers` and other configs. See the " + "DEFAULT_CONFIG defined by each agent for more info.\n\n" + "The config of this agent is: " + json.dumps(config)) - def __init__( - self, config=None, env=None, logger_creator=None): + def __init__(self, config=None, env=None, logger_creator=None): """Initialize an RLLib agent. Args: @@ -235,8 +236,8 @@ class Agent(Trainable): obs = self.local_evaluator.filters["default"]( observation, update=False) return self.local_evaluator.for_policy( - lambda p: p.compute_single_action( - obs, state, is_training=False)[0]) + lambda p: p.compute_single_action(obs, state, is_training=False)[0] + ) class _MockAgent(Agent): @@ -257,8 +258,10 @@ class _MockAgent(Agent): and (self.config["persistent_error"] or not self.restored): raise Exception("mock error") return TrainingResult( - episode_reward_mean=10, episode_len_mean=10, - timesteps_this_iter=10, info={}) + episode_reward_mean=10, + episode_len_mean=10, + timesteps_this_iter=10, + info={}) def _save(self, checkpoint_dir): path = os.path.join(checkpoint_dir, "mock_agent.pkl") @@ -299,9 +302,11 @@ class _SigmoidFakeData(_MockAgent): v = np.tanh(float(i) / self.config["width"]) v *= self.config["height"] return TrainingResult( - episode_reward_mean=v, episode_len_mean=v, + episode_reward_mean=v, + episode_len_mean=v, timesteps_this_iter=self.config["iter_timesteps"], - time_this_iter_s=self.config["iter_time"], info={}) + time_this_iter_s=self.config["iter_time"], + info={}) class _ParameterTuningAgent(_MockAgent): @@ -320,7 +325,8 @@ class _ParameterTuningAgent(_MockAgent): episode_reward_mean=self.config["reward_amt"] * self.iteration, episode_len_mean=self.config["reward_amt"], timesteps_this_iter=self.config["iter_timesteps"], - time_this_iter_s=self.config["iter_time"], info={}) + time_this_iter_s=self.config["iter_time"], + info={}) def get_agent_class(alg): @@ -363,5 +369,4 @@ def get_agent_class(alg): elif alg == "__parameter_tuning": return _ParameterTuningAgent else: - raise Exception( - ("Unknown algorithm {}.").format(alg)) + raise Exception(("Unknown algorithm {}.").format(alg)) diff --git a/python/ray/rllib/agents/bc/bc.py b/python/ray/rllib/agents/bc/bc.py index 1484a5dbe..2bb3792b8 100644 --- a/python/ray/rllib/agents/bc/bc.py +++ b/python/ray/rllib/agents/bc/bc.py @@ -57,28 +57,31 @@ class BCAgent(Agent): else: num_gpus_per_worker = 0 return Resources( - cpu=1, gpu=cf["gpu"] and 1 or 0, + cpu=1, + gpu=cf["gpu"] and 1 or 0, extra_cpu=cf["num_workers"], extra_gpu=num_gpus_per_worker * cf["num_workers"]) def _init(self): - self.local_evaluator = BCEvaluator( - self.env_creator, self.config, self.logdir) + self.local_evaluator = BCEvaluator(self.env_creator, self.config, + self.logdir) if self.config["use_gpu_for_workers"]: remote_cls = GPURemoteBCEvaluator else: remote_cls = RemoteBCEvaluator self.remote_evaluators = [ remote_cls.remote(self.env_creator, self.config, self.logdir) - for _ in range(self.config["num_workers"])] - self.optimizer = AsyncGradientsOptimizer( - self.local_evaluator, self.remote_evaluators, - self.config["optimizer"]) + for _ in range(self.config["num_workers"]) + ] + self.optimizer = AsyncGradientsOptimizer(self.local_evaluator, + self.remote_evaluators, + self.config["optimizer"]) def _train(self): self.optimizer.step() - metric_lists = [re.get_metrics.remote() for re in - self.remote_evaluators] + metric_lists = [ + re.get_metrics.remote() for re in self.remote_evaluators + ] total_samples = 0 total_loss = 0 for metrics in metric_lists: diff --git a/python/ray/rllib/agents/bc/bc_evaluator.py b/python/ray/rllib/agents/bc/bc_evaluator.py index e896b1f88..4726b4a3c 100644 --- a/python/ray/rllib/agents/bc/bc_evaluator.py +++ b/python/ray/rllib/agents/bc/bc_evaluator.py @@ -14,8 +14,8 @@ from ray.rllib.models import ModelCatalog class BCEvaluator(EvaluatorInterface): def __init__(self, env_creator, config, logdir): - env = ModelCatalog.get_preprocessor_as_wrapper(env_creator( - config["env_config"]), config["model"]) + env = ModelCatalog.get_preprocessor_as_wrapper( + env_creator(config["env_config"]), config["model"]) self.dataset = ExperienceDataset(config["dataset_path"]) self.policy = BCPolicy(env.observation_space, env.action_space, config) self.config = config @@ -27,8 +27,10 @@ class BCEvaluator(EvaluatorInterface): def compute_gradients(self, samples): gradient, info = self.policy.compute_gradients(samples) - self.metrics_queue.put( - {"num_samples": info["num_samples"], "loss": info["loss"]}) + self.metrics_queue.put({ + "num_samples": info["num_samples"], + "loss": info["loss"] + }) return gradient, {} def apply_gradients(self, grads): @@ -42,8 +44,7 @@ class BCEvaluator(EvaluatorInterface): def save(self): weights = self.get_weights() - return pickle.dumps({ - "weights": weights}) + return pickle.dumps({"weights": weights}) def restore(self, objs): objs = pickle.loads(objs) diff --git a/python/ray/rllib/agents/bc/experience_dataset.py b/python/ray/rllib/agents/bc/experience_dataset.py index ccf47bc31..d08284184 100644 --- a/python/ray/rllib/agents/bc/experience_dataset.py +++ b/python/ray/rllib/agents/bc/experience_dataset.py @@ -21,8 +21,9 @@ class ExperienceDataset(object): elements. The file must be available on each machine used by a BCEvaluator. """ - self._dataset = list(itertools.chain.from_iterable( - pickle.load(open(dataset_path, "rb")))) + self._dataset = list( + itertools.chain.from_iterable( + pickle.load(open(dataset_path, "rb")))) def sample(self, batch_size): indexes = np.random.choice(len(self._dataset), batch_size) diff --git a/python/ray/rllib/agents/bc/policy.py b/python/ray/rllib/agents/bc/policy.py index e3077dd3d..a504e3ec6 100644 --- a/python/ray/rllib/agents/bc/policy.py +++ b/python/ray/rllib/agents/bc/policy.py @@ -23,8 +23,8 @@ class BCPolicy(object): self.x = tf.placeholder(tf.float32, [None] + list(obs_space.shape)) dist_class, self.logit_dim = ModelCatalog.get_action_dist( ac_space, self.config["model"]) - self._model = ModelCatalog.get_model( - self.x, self.logit_dim, self.config["model"]) + self._model = ModelCatalog.get_model(self.x, self.logit_dim, + self.config["model"]) self.logits = self._model.outputs self.curr_dist = dist_class(self.logits) self.sample = self.curr_dist.sample() @@ -33,17 +33,16 @@ class BCPolicy(object): def setup_loss(self, action_space): if isinstance(action_space, gym.spaces.Box): - self.ac = tf.placeholder(tf.float32, - [None] + list(action_space.shape), - name="ac") + self.ac = tf.placeholder( + tf.float32, [None] + list(action_space.shape), name="ac") elif isinstance(action_space, gym.spaces.Discrete): self.ac = tf.placeholder(tf.int64, [None], name="ac") else: - raise NotImplementedError( - "action space" + str(type(action_space)) + - "currently not supported") + raise NotImplementedError("action space" + + str(type(action_space)) + + "currently not supported") log_prob = self.curr_dist.logp(self.ac) - self.pi_loss = - tf.reduce_sum(log_prob) + self.pi_loss = -tf.reduce_sum(log_prob) self.loss = self.pi_loss def setup_gradients(self): @@ -62,11 +61,14 @@ class BCPolicy(object): self.summary_op = tf.summary.merge_all() # TODO(rliaw): Can consider exposing these parameters - self.sess = tf.Session(graph=self.g, config=tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=2, - gpu_options=tf.GPUOptions(allow_growth=True))) - self.variables = ray.experimental.TensorFlowVariables(self.loss, - self.sess) + self.sess = tf.Session( + graph=self.g, + config=tf.ConfigProto( + intra_op_parallelism_threads=1, + inter_op_parallelism_threads=2, + gpu_options=tf.GPUOptions(allow_growth=True))) + self.variables = ray.experimental.TensorFlowVariables( + self.loss, self.sess) self.sess.run(tf.global_variables_initializer()) def compute_gradients(self, samples): @@ -82,15 +84,14 @@ class BCPolicy(object): [self.loss, self.grads, self.summary_op], feed_dict=feed_dict) info["summary"] = summ else: - loss, grad = self.sess.run([self.loss, self.grads], - feed_dict=feed_dict) + loss, grad = self.sess.run( + [self.loss, self.grads], feed_dict=feed_dict) info["num_samples"] = len(samples) info["loss"] = loss return grad, info def apply_gradients(self, grads): - feed_dict = {self.grads[i]: grads[i] - for i in range(len(grads))} + feed_dict = {self.grads[i]: grads[i] for i in range(len(grads))} self.sess.run(self._apply_gradients, feed_dict=feed_dict) def get_weights(self): diff --git a/python/ray/rllib/agents/ddpg/apex.py b/python/ray/rllib/agents/ddpg/apex.py index b53d4178e..b35f1ea35 100644 --- a/python/ray/rllib/agents/ddpg/apex.py +++ b/python/ray/rllib/agents/ddpg/apex.py @@ -9,13 +9,12 @@ APEX_DDPG_DEFAULT_CONFIG = merge_dicts( DDPG_CONFIG, { "optimizer_class": "AsyncSamplesOptimizer", - "optimizer": - merge_dicts( - DDPG_CONFIG["optimizer"], { - "max_weight_sync_delay": 400, - "num_replay_buffer_shards": 4, - "debug": False - }), + "optimizer": merge_dicts( + DDPG_CONFIG["optimizer"], { + "max_weight_sync_delay": 400, + "num_replay_buffer_shards": 4, + "debug": False + }), "n_step": 3, "num_workers": 32, "buffer_size": 2000000, diff --git a/python/ray/rllib/agents/ddpg/ddpg.py b/python/ray/rllib/agents/ddpg/ddpg.py index c7e45f1b3..95b6859d2 100644 --- a/python/ray/rllib/agents/ddpg/ddpg.py +++ b/python/ray/rllib/agents/ddpg/ddpg.py @@ -118,9 +118,9 @@ class DDPGAgent(DQNAgent): if self.config["per_worker_exploration"]: assert self.config["num_workers"] > 1, \ "This requires multiple workers" - return ConstantSchedule( - self.config["noise_scale"] * 0.4 ** - (1 + worker_index / float(self.config["num_workers"] - 1) * 7)) + exponent = ( + 1 + worker_index / float(self.config["num_workers"] - 1) * 7) + return ConstantSchedule(self.config["noise_scale"] * 0.4**exponent) else: return LinearSchedule( schedule_timesteps=int(self.config["exploration_fraction"] * diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index 1dd8941b9..ceae0d0f0 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -14,7 +14,6 @@ from ray.rllib.models import ModelCatalog from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph - A_SCOPE = "a_func" P_SCOPE = "p_func" P_TARGET_SCOPE = "target_p_func" @@ -26,8 +25,8 @@ class PNetwork(object): """Maps an observations (i.e., state) to an action where each entry takes value from (0, 1) due to the sigmoid function.""" - def __init__( - self, model, dim_actions, hiddens=[64, 64], activation="relu"): + def __init__(self, model, dim_actions, hiddens=[64, 64], + activation="relu"): action_out = model.last_layer activation = tf.nn.__dict__[activation] for hidden in hiddens: @@ -44,9 +43,14 @@ class ActionNetwork(object): for training, thus ignoring the batch_size issue when constructing a stochastic action.""" - def __init__( - self, p_values, low_action, high_action, stochastic, eps, - theta=0.15, sigma=0.2): + def __init__(self, + p_values, + low_action, + high_action, + stochastic, + eps, + theta=0.15, + sigma=0.2): # shape is [None, dim_action] deterministic_actions = ( @@ -65,15 +69,16 @@ class ActionNetwork(object): stochastic_actions = deterministic_actions + eps * ( high_action - low_action) * exploration_value - self.actions = tf.cond( - stochastic, lambda: stochastic_actions, - lambda: deterministic_actions) + self.actions = tf.cond(stochastic, lambda: stochastic_actions, + lambda: deterministic_actions) class QNetwork(object): - def __init__( - self, model, action_inputs, - hiddens=[64, 64], activation="relu"): + def __init__(self, + model, + action_inputs, + hiddens=[64, 64], + activation="relu"): q_out = tf.concat([model.last_layer, action_inputs], axis=1) activation = tf.nn.__dict__[activation] for hidden in hiddens: @@ -84,14 +89,21 @@ class QNetwork(object): class ActorCriticLoss(object): - def __init__( - self, q_t, q_tp1, q_tp0, importance_weights, rewards, done_mask, - gamma=0.99, n_step=1, use_huber=False, huber_threshold=1.0): + def __init__(self, + q_t, + q_tp1, + q_tp0, + importance_weights, + rewards, + done_mask, + gamma=0.99, + n_step=1, + use_huber=False, + huber_threshold=1.0): q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1) - q_tp1_best = tf.squeeze( - input=q_tp1, axis=len(q_tp1.shape) - 1) + q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1) q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best # compute RHS of bellman equation @@ -131,27 +143,20 @@ class DDPGPolicyGraph(TFPolicyGraph): def _build_q_network(obs, actions): return QNetwork( - ModelCatalog.get_model(obs, 1, config["model"]), - actions, + ModelCatalog.get_model(obs, 1, config["model"]), actions, config["critic_hiddens"], config["critic_hidden_activation"]).value def _build_p_network(obs): return PNetwork( - ModelCatalog.get_model(obs, 1, config["model"]), - dim_actions, + ModelCatalog.get_model(obs, 1, config["model"]), dim_actions, config["actor_hiddens"], config["actor_hidden_activation"]).action_scores def _build_action_network(p_values, stochastic, eps): - return ActionNetwork( - p_values, - low_action, - high_action, - stochastic, - eps, - config["exploration_theta"], - config["exploration_sigma"]).actions + return ActionNetwork(p_values, low_action, high_action, stochastic, + eps, config["exploration_theta"], + config["exploration_sigma"]).actions # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") @@ -263,9 +268,13 @@ class DDPGPolicyGraph(TFPolicyGraph): ("weights", self.importance_weights), ] TFPolicyGraph.__init__( - self, observation_space, action_space, self.sess, + self, + observation_space, + action_space, + self.sess, obs_input=self.cur_observations, - action_sampler=self.output_actions, loss=self.loss.total_loss, + action_sampler=self.output_actions, + loss=self.loss.total_loss, loss_inputs=self.loss_inputs) self.sess.run(tf.global_variables_initializer()) @@ -294,10 +303,10 @@ class DDPGPolicyGraph(TFPolicyGraph): self.loss.actor_loss, var_list=self.p_func_vars) critic_grads_and_vars = self.critic_optimizer.compute_gradients( self.loss.critic_loss, var_list=self.q_func_vars) - actor_grads_and_vars = [ - (g, v) for (g, v) in actor_grads_and_vars if g is not None] - critic_grads_and_vars = [ - (g, v) for (g, v) in critic_grads_and_vars if g is not None] + actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars + if g is not None] + critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars + if g is not None] grads_and_vars = actor_grads_and_vars + critic_grads_and_vars return grads_and_vars diff --git a/python/ray/rllib/agents/dqn/apex.py b/python/ray/rllib/agents/dqn/apex.py index 1c8b2f6b3..9321a70ff 100644 --- a/python/ray/rllib/agents/dqn/apex.py +++ b/python/ray/rllib/agents/dqn/apex.py @@ -10,13 +10,12 @@ APEX_DEFAULT_CONFIG = merge_dicts( DQN_CONFIG, { "optimizer_class": "AsyncSamplesOptimizer", - "optimizer": - merge_dicts( - DQN_CONFIG["optimizer"], { - "max_weight_sync_delay": 400, - "num_replay_buffer_shards": 4, - "debug": False - }), + "optimizer": merge_dicts( + DQN_CONFIG["optimizer"], { + "max_weight_sync_delay": 400, + "num_replay_buffer_shards": 4, + "debug": False + }), "n_step": 3, "gpu": True, "num_workers": 32, diff --git a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py index adb4e427b..197831c1f 100644 --- a/python/ray/rllib/agents/dqn/dqn.py +++ b/python/ray/rllib/agents/dqn/dqn.py @@ -13,11 +13,11 @@ from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule from ray.tune.trial import Resources - OPTIMIZER_SHARED_CONFIGS = [ "buffer_size", "prioritized_replay", "prioritized_replay_alpha", "prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size", - "train_batch_size", "learning_starts", "clip_rewards"] + "train_batch_size", "learning_starts", "clip_rewards" +] DEFAULT_CONFIG = with_common_config({ # === Model === @@ -110,7 +110,8 @@ class DQNAgent(Agent): def default_resource_request(cls, config): cf = dict(cls._default_config, **config) return Resources( - cpu=1, gpu=cf["gpu"] and 1 or 0, + cpu=1, + gpu=cf["gpu"] and 1 or 0, extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"], extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"]) @@ -123,7 +124,8 @@ class DQNAgent(Agent): self.exploration0 = self._make_exploration_schedule(0) self.explorations = [ self._make_exploration_schedule(i) - for i in range(self.config["num_workers"])] + for i in range(self.config["num_workers"]) + ] for k in OPTIMIZER_SHARED_CONFIGS: if k not in self.config["optimizer"]: @@ -132,9 +134,10 @@ class DQNAgent(Agent): self.local_evaluator = self.make_local_evaluator( self.env_creator, self._policy_graph) self.remote_evaluators = self.make_remote_evaluators( - self.env_creator, self._policy_graph, self.config["num_workers"], - {"num_cpus": self.config["num_cpus_per_worker"], - "num_gpus": self.config["num_gpus_per_worker"]}) + self.env_creator, self._policy_graph, self.config["num_workers"], { + "num_cpus": self.config["num_cpus_per_worker"], + "num_gpus": self.config["num_gpus_per_worker"] + }) self.optimizer = getattr(optimizers, self.config["optimizer_class"])( self.local_evaluator, self.remote_evaluators, self.config["optimizer"]) @@ -147,14 +150,12 @@ class DQNAgent(Agent): if self.config["per_worker_exploration"]: assert self.config["num_workers"] > 1, \ "This requires multiple workers" - return ConstantSchedule( - 0.4 ** ( - 1 + worker_index / float( - self.config["num_workers"] - 1) * 7)) + exponent = ( + 1 + worker_index / float(self.config["num_workers"] - 1) * 7) + return ConstantSchedule(0.4**exponent) return LinearSchedule( - schedule_timesteps=int( - self.config["exploration_fraction"] * - self.config["schedule_max_timesteps"]), + schedule_timesteps=int(self.config["exploration_fraction"] * + self.config["schedule_max_timesteps"]), initial_p=1.0, final_p=self.config["exploration_final_eps"]) @@ -191,8 +192,8 @@ class DQNAgent(Agent): self.local_evaluator, self.remote_evaluators[-len(self.remote_evaluators) // 3:]) else: - result = collect_metrics( - self.local_evaluator, self.remote_evaluators) + result = collect_metrics(self.local_evaluator, + self.remote_evaluators) return result._replace( timesteps_this_iter=self.global_timestep - start_timestep, @@ -208,14 +209,14 @@ class DQNAgent(Agent): ev.__ray_terminate__.remote() def _save(self, checkpoint_dir): - checkpoint_path = os.path.join( - checkpoint_dir, "checkpoint-{}".format(self.iteration)) + checkpoint_path = os.path.join(checkpoint_dir, + "checkpoint-{}".format(self.iteration)) extra_data = [ self.local_evaluator.save(), ray.get([e.save.remote() for e in self.remote_evaluators]), - self.optimizer.save(), - self.num_target_updates, - self.last_target_update_ts] + self.optimizer.save(), self.num_target_updates, + self.last_target_update_ts + ] pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb")) return checkpoint_path @@ -223,8 +224,9 @@ class DQNAgent(Agent): extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb")) self.local_evaluator.restore(extra_data[0]) ray.get([ - e.restore.remote(d) for (d, e) - in zip(extra_data[1], self.remote_evaluators)]) + e.restore.remote(d) + for (d, e) in zip(extra_data[1], self.remote_evaluators) + ]) self.optimizer.restore(extra_data[2]) self.num_target_updates = extra_data[3] self.last_target_update_ts = extra_data[4] diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py index 7905935ce..f553ad325 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py +++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py @@ -13,7 +13,6 @@ from ray.rllib.evaluation.sample_batch import SampleBatch from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph - Q_SCOPE = "q_func" Q_TARGET_SCOPE = "target_q_func" @@ -33,7 +32,8 @@ class QNetwork(object): state_out = model.last_layer for hidden in hiddens: state_out = layers.fully_connected( - state_out, num_outputs=hidden, + state_out, + num_outputs=hidden, activation_fn=tf.nn.relu) state_score = layers.fully_connected( state_out, num_outputs=1, activation_fn=None) @@ -50,26 +50,32 @@ class QValuePolicy(object): deterministic_actions = tf.argmax(q_values, axis=1) batch_size = tf.shape(observations)[0] random_actions = tf.random_uniform( - tf.stack([batch_size]), minval=0, maxval=num_actions, + tf.stack([batch_size]), + minval=0, + maxval=num_actions, dtype=tf.int64) chose_random = tf.random_uniform( tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps - stochastic_actions = tf.where( - chose_random, random_actions, deterministic_actions) - self.action = tf.cond( - stochastic, lambda: stochastic_actions, - lambda: deterministic_actions) + stochastic_actions = tf.where(chose_random, random_actions, + deterministic_actions) + self.action = tf.cond(stochastic, lambda: stochastic_actions, + lambda: deterministic_actions) class QLoss(object): - def __init__( - self, q_t_selected, q_tp1_best, importance_weights, rewards, - done_mask, gamma=0.99, n_step=1): + def __init__(self, + q_t_selected, + q_tp1_best, + importance_weights, + rewards, + done_mask, + gamma=0.99, + n_step=1): q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best # compute RHS of bellman equation - q_t_selected_target = rewards + gamma ** n_step * q_tp1_best_masked + q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked # compute the error (potentially clipped) self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) @@ -91,14 +97,14 @@ class DQNPolicyGraph(TFPolicyGraph): def _build_q_network(obs): return QNetwork( - ModelCatalog.get_model(obs, 1, config["model"]), - num_actions, config["dueling"], config["hiddens"]).value + ModelCatalog.get_model(obs, 1, config["model"]), num_actions, + config["dueling"], config["hiddens"]).value # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.eps = tf.placeholder(tf.float32, (), name="eps") self.cur_observations = tf.placeholder( - tf.float32, shape=(None,) + observation_space.shape) + tf.float32, shape=(None, ) + observation_space.shape) # Action Q network with tf.variable_scope(Q_SCOPE) as scope: @@ -106,20 +112,17 @@ class DQNPolicyGraph(TFPolicyGraph): self.q_func_vars = _scope_vars(scope.name) # Action outputs - self.output_actions = QValuePolicy( - q_values, - self.cur_observations, - num_actions, - self.stochastic, - self.eps).action + self.output_actions = QValuePolicy(q_values, self.cur_observations, + num_actions, self.stochastic, + self.eps).action # Replay inputs self.obs_t = tf.placeholder( - tf.float32, shape=(None,) + observation_space.shape) + tf.float32, shape=(None, ) + observation_space.shape) self.act_t = tf.placeholder(tf.int32, [None], name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder( - tf.float32, shape=(None,) + observation_space.shape) + tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder( tf.float32, [None], name="weight") @@ -134,8 +137,8 @@ class DQNPolicyGraph(TFPolicyGraph): self.target_q_func_vars = _scope_vars(scope.name) # q scores for actions which we know were selected in the given state. - q_t_selected = tf.reduce_sum( - q_t * tf.one_hot(self.act_t, num_actions), 1) + q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions), + 1) # compute estimate of best possible value starting from state at t + 1 if config["double_q"]: @@ -143,20 +146,20 @@ class DQNPolicyGraph(TFPolicyGraph): q_tp1_using_online_net = _build_q_network(self.obs_tp1) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( - q_tp1 * tf.one_hot( - q_tp1_best_using_online_net, num_actions), 1) + q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), + 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) - self.loss = QLoss( - q_t_selected, q_tp1_best, self.importance_weights, - self.rew_t, self.done_mask, config["gamma"], config["n_step"]) + self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights, + self.rew_t, self.done_mask, config["gamma"], + config["n_step"]) # update_target_fn will be called periodically to copy Q network to # target Q network update_target_expr = [] for var, var_target in zip( - sorted(self.q_func_vars, key=lambda v: v.name), + sorted(self.q_func_vars, key=lambda v: v.name), sorted(self.target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) self.update_target_expr = tf.group(*update_target_expr) @@ -172,9 +175,13 @@ class DQNPolicyGraph(TFPolicyGraph): ("weights", self.importance_weights), ] TFPolicyGraph.__init__( - self, observation_space, action_space, self.sess, + self, + observation_space, + action_space, + self.sess, obs_input=self.cur_observations, - action_sampler=self.output_actions, loss=self.loss.loss, + action_sampler=self.output_actions, + loss=self.loss.loss, loss_inputs=self.loss_inputs) self.sess.run(tf.global_variables_initializer()) @@ -184,13 +191,14 @@ class DQNPolicyGraph(TFPolicyGraph): def gradients(self, optimizer): if self.config["grad_norm_clipping"] is not None: grads_and_vars = _minimize_and_clip( - optimizer, self.loss.loss, var_list=self.q_func_vars, + optimizer, + self.loss.loss, + var_list=self.q_func_vars, clip_val=self.config["grad_norm_clipping"]) else: grads_and_vars = optimizer.compute_gradients( self.loss.loss, var_list=self.q_func_vars) - grads_and_vars = [ - (g, v) for (g, v) in grads_and_vars if g is not None] + grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None] return grads_and_vars def extra_compute_action_feed_dict(self): @@ -207,8 +215,8 @@ class DQNPolicyGraph(TFPolicyGraph): def postprocess_trajectory(self, sample_batch, other_agent_batches=None): return _postprocess_dqn(self, sample_batch) - def compute_td_error( - self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): + def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask, + importance_weights): td_err = self.sess.run( self.loss.td_error, feed_dict={ @@ -254,7 +262,7 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones): continue # episode end for j in range(1, n_step): new_obs[i] = new_obs[i + j] - rewards[i] += gamma ** j * rewards[i + j] + rewards[i] += gamma**j * rewards[i + j] if dones[i + j]: break # episode end # truncate ends of the trajectory @@ -266,24 +274,29 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones): def _postprocess_dqn(policy_graph, sample_batch): obs, actions, rewards, new_obs, dones = [ list(x) for x in sample_batch.columns( - ["obs", "actions", "rewards", "new_obs", "dones"])] + ["obs", "actions", "rewards", "new_obs", "dones"]) + ] # N-step Q adjustments if policy_graph.config["n_step"] > 1: - adjust_nstep( - policy_graph.config["n_step"], policy_graph.config["gamma"], - obs, actions, rewards, new_obs, dones) + adjust_nstep(policy_graph.config["n_step"], + policy_graph.config["gamma"], obs, actions, rewards, + new_obs, dones) batch = SampleBatch({ - "obs": obs, "actions": actions, "rewards": rewards, - "new_obs": new_obs, "dones": dones, - "weights": np.ones_like(rewards)}) + "obs": obs, + "actions": actions, + "rewards": rewards, + "new_obs": new_obs, + "dones": dones, + "weights": np.ones_like(rewards) + }) # Prioritize on the worker side if batch.count > 0 and policy_graph.config["worker_side_prioritization"]: td_errors = policy_graph.compute_td_error( - batch["obs"], batch["actions"], batch["rewards"], - batch["new_obs"], batch["dones"], batch["weights"]) + batch["obs"], batch["actions"], batch["rewards"], batch["new_obs"], + batch["dones"], batch["weights"]) new_priorities = ( np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"]) batch.data["weights"] = new_priorities @@ -295,8 +308,7 @@ def _huber_loss(x, delta=1.0): """Reference: https://en.wikipedia.org/wiki/Huber_loss""" return tf.where( tf.abs(x) < delta, - tf.square(x) * 0.5, - delta * (tf.abs(x) - 0.5 * delta)) + tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta)) def _minimize_and_clip(optimizer, objective, var_list, clip_val=10): diff --git a/python/ray/rllib/agents/es/es.py b/python/ray/rllib/agents/es/es.py index 62249e380..a2a39e612 100644 --- a/python/ray/rllib/agents/es/es.py +++ b/python/ray/rllib/agents/es/es.py @@ -20,13 +20,11 @@ from ray.rllib.agents.es import policies from ray.rllib.agents.es import tabular_logger as tlogger from ray.rllib.agents.es import utils - Result = namedtuple("Result", [ "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths", "eval_returns", "eval_lengths" ]) - DEFAULT_CONFIG = { 'l2_coeff': 0.005, 'noise_stdev': 0.02, @@ -64,7 +62,11 @@ class SharedNoiseTable(object): @ray.remote class Worker(object): - def __init__(self, config, policy_params, env_creator, noise, + def __init__(self, + config, + policy_params, + env_creator, + noise, min_task_runtime=0.2): self.min_task_runtime = min_task_runtime self.config = config @@ -82,7 +84,9 @@ class Worker(object): def rollout(self, timestep_limit, add_noise=True): rollout_rewards, rollout_length = policies.rollout( - self.policy, self.env, timestep_limit=timestep_limit, + self.policy, + self.env, + timestep_limit=timestep_limit, add_noise=add_noise) return rollout_rewards, rollout_length @@ -95,8 +99,8 @@ class Worker(object): # Perform some rollouts with noise. task_tstart = time.time() - while (len(noise_indices) == 0 or - time.time() - task_tstart < self.min_task_runtime): + while (len(noise_indices) == 0 + or time.time() - task_tstart < self.min_task_runtime): if np.random.uniform() < self.config["eval_prob"]: # Do an evaluation run with no perturbation. @@ -122,7 +126,8 @@ class Worker(object): noise_indices.append(noise_index) returns.append([rewards_pos.sum(), rewards_neg.sum()]) sign_returns.append( - [np.sign(rewards_pos).sum(), np.sign(rewards_neg).sum()]) + [np.sign(rewards_pos).sum(), + np.sign(rewards_neg).sum()]) lengths.append([lengths_pos, lengths_neg]) return Result( @@ -146,9 +151,7 @@ class ESAgent(Agent): return Resources(cpu=1, gpu=0, extra_cpu=cf["num_workers"]) def _init(self): - policy_params = { - "action_noise_std": 0.01 - } + policy_params = {"action_noise_std": 0.01} env = self.env_creator(self.config["env_config"]) from ray.rllib import models @@ -168,9 +171,9 @@ class ESAgent(Agent): # Create the actors. print("Creating actors.") self.workers = [ - Worker.remote( - self.config, policy_params, self.env_creator, noise_id) - for _ in range(self.config["num_workers"])] + Worker.remote(self.config, policy_params, self.env_creator, + noise_id) for _ in range(self.config["num_workers"]) + ] self.episodes_so_far = 0 self.timesteps_so_far = 0 @@ -180,21 +183,20 @@ class ESAgent(Agent): num_episodes, num_timesteps = 0, 0 results = [] while num_episodes < min_episodes or num_timesteps < min_timesteps: - print( - "Collected {} episodes {} timesteps so far this iter".format( - num_episodes, num_timesteps)) - rollout_ids = [worker.do_rollouts.remote(theta_id) - for worker in self.workers] + print("Collected {} episodes {} timesteps so far this iter".format( + num_episodes, num_timesteps)) + rollout_ids = [ + worker.do_rollouts.remote(theta_id) for worker in self.workers + ] # Get the results of the rollouts. for result in ray.get(rollout_ids): results.append(result) # Update the number of episodes and the number of timesteps # keeping in mind that result.noisy_lengths is a list of lists, # where the inner lists have length 2. - num_episodes += sum(len(pair) for pair - in result.noisy_lengths) - num_timesteps += sum(sum(pair) for pair - in result.noisy_lengths) + num_episodes += sum(len(pair) for pair in result.noisy_lengths) + num_timesteps += sum( + sum(pair) for pair in result.noisy_lengths) return results, num_episodes, num_timesteps def _train(self): @@ -209,8 +211,7 @@ class ESAgent(Agent): # Use the actors to do rollouts, note that we pass in the ID of the # policy weights. results, num_episodes, num_timesteps = self._collect_results( - theta_id, - config["episodes_per_batch"], + theta_id, config["episodes_per_batch"], config["timesteps_per_batch"]) all_noise_indices = [] @@ -255,13 +256,11 @@ class ESAgent(Agent): for index in noise_indices), batch_size=500) g /= noisy_returns.size - assert ( - g.shape == (self.policy.num_params,) and - g.dtype == np.float32 and - count == len(noise_indices)) + assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32 + and count == len(noise_indices)) # Compute the new weights theta. - theta, update_ratio = self.optimizer.update( - -g + config["l2_coeff"] * theta) + theta, update_ratio = self.optimizer.update(-g + + config["l2_coeff"] * theta) # Set the new weights in the local copy of the policy. self.policy.set_weights(theta) @@ -313,13 +312,10 @@ class ESAgent(Agent): w.__ray_terminate__.remote() def _save(self, checkpoint_dir): - checkpoint_path = os.path.join( - checkpoint_dir, "checkpoint-{}".format(self.iteration)) + checkpoint_path = os.path.join(checkpoint_dir, + "checkpoint-{}".format(self.iteration)) weights = self.policy.get_weights() - objects = [ - weights, - self.episodes_so_far, - self.timesteps_so_far] + objects = [weights, self.episodes_so_far, self.timesteps_so_far] pickle.dump(objects, open(checkpoint_path, "wb")) return checkpoint_path diff --git a/python/ray/rllib/agents/es/optimizers.py b/python/ray/rllib/agents/es/optimizers.py index f5ef4e109..3b48f7393 100644 --- a/python/ray/rllib/agents/es/optimizers.py +++ b/python/ray/rllib/agents/es/optimizers.py @@ -48,8 +48,8 @@ class Adam(Optimizer): self.v = np.zeros(self.dim, dtype=np.float32) def _compute_step(self, globalg): - a = self.stepsize * (np.sqrt(1 - self.beta2 ** self.t) / - (1 - self.beta1 ** self.t)) + a = self.stepsize * (np.sqrt(1 - self.beta2**self.t) / + (1 - self.beta1**self.t)) self.m = self.beta1 * self.m + (1 - self.beta1) * globalg self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) step = -a * self.m / (np.sqrt(self.v) + self.epsilon) diff --git a/python/ray/rllib/agents/es/policies.py b/python/ray/rllib/agents/es/policies.py index eb492373f..d62fee43c 100644 --- a/python/ray/rllib/agents/es/policies.py +++ b/python/ray/rllib/agents/es/policies.py @@ -21,8 +21,8 @@ def rollout(policy, env, timestep_limit=None, add_noise=False): noise drawn from that stream. Otherwise, no action noise will be added. """ env_timestep_limit = env.spec.max_episode_steps - timestep_limit = (env_timestep_limit if timestep_limit is None - else min(timestep_limit, env_timestep_limit)) + timestep_limit = (env_timestep_limit if timestep_limit is None else min( + timestep_limit, env_timestep_limit)) rews = [] t = 0 observation = env.reset() @@ -38,16 +38,16 @@ def rollout(policy, env, timestep_limit=None, add_noise=False): class GenericPolicy(object): - def __init__(self, sess, action_space, preprocessor, - observation_filter, action_noise_std): + def __init__(self, sess, action_space, preprocessor, observation_filter, + action_noise_std): self.sess = sess self.action_space = action_space self.action_noise_std = action_noise_std self.preprocessor = preprocessor - self.observation_filter = get_filter( - observation_filter, self.preprocessor.shape) - self.inputs = tf.placeholder( - tf.float32, [None] + list(self.preprocessor.shape)) + self.observation_filter = get_filter(observation_filter, + self.preprocessor.shape) + self.inputs = tf.placeholder(tf.float32, + [None] + list(self.preprocessor.shape)) # Policy network. dist_class, dist_dim = ModelCatalog.get_action_dist( @@ -59,16 +59,16 @@ class GenericPolicy(object): self.variables = ray.experimental.TensorFlowVariables( model.outputs, self.sess) - self.num_params = sum(np.prod(variable.shape.as_list()) - for _, variable - in self.variables.variables.items()) + self.num_params = sum( + np.prod(variable.shape.as_list()) + for _, variable in self.variables.variables.items()) self.sess.run(tf.global_variables_initializer()) def compute(self, observation, add_noise=False, update=True): observation = self.preprocessor.transform(observation) observation = self.observation_filter(observation[None], update=update) - action = self.sess.run(self.sampler, - feed_dict={self.inputs: observation}) + action = self.sess.run( + self.sampler, feed_dict={self.inputs: observation}) if add_noise and isinstance(self.action_space, gym.spaces.Box): action += np.random.randn(*action.shape) * self.action_noise_std return action diff --git a/python/ray/rllib/agents/es/tabular_logger.py b/python/ray/rllib/agents/es/tabular_logger.py index 80e7b5b37..1463e59e0 100644 --- a/python/ray/rllib/agents/es/tabular_logger.py +++ b/python/ray/rllib/agents/es/tabular_logger.py @@ -25,6 +25,7 @@ DISABLED = 50 class TbWriter(object): """Based on SummaryWriter, but changed to allow for a different prefix.""" + def __init__(self, dir, prefix): self.dir = dir # Start at 1, because EvWriter automatically generates an object with @@ -34,9 +35,10 @@ class TbWriter(object): compat.as_bytes(os.path.join(dir, prefix))) def write_values(self, key2val): - summary = tf.Summary(value=[tf.Summary.Value(tag=k, - simple_value=float(v)) - for (k, v) in key2val.items()]) + summary = tf.Summary(value=[ + tf.Summary.Value(tag=k, simple_value=float(v)) + for (k, v) in key2val.items() + ]) event = event_pb2.Event(wall_time=time.time(), summary=summary) event.step = self.step self.evwriter.WriteEvent(event) @@ -46,6 +48,7 @@ class TbWriter(object): def close(self): self.evwriter.Close() + # API @@ -126,6 +129,7 @@ def get_expt_dir(): sys.stderr.write("get_expt_dir() is Deprecated. Switch to get_dir()\n") return get_dir() + # Backend @@ -167,8 +171,8 @@ class _Logger(object): # Write to all text outputs self._write_text("-" * (keywidth + valwidth + 7), "\n") for (key, val) in key2str.items(): - self._write_text("| ", key, " " * (keywidth - len(key)), - " | ", val, " " * (valwidth - len(val)), " |\n") + self._write_text("| ", key, " " * (keywidth - len(key)), " | ", + val, " " * (valwidth - len(val)), " |\n") self._write_text("-" * (keywidth + valwidth + 7), "\n") for f in self.text_outputs: try: @@ -202,7 +206,7 @@ class _Logger(object): # Misc def _do_log(self, *args): - self._write_text(*args + ('\n',)) + self._write_text(*args + ('\n', )) for f in self.text_outputs: try: f.flush() diff --git a/python/ray/rllib/agents/es/utils.py b/python/ray/rllib/agents/es/utils.py index 6ea5d31ac..1575e46c3 100644 --- a/python/ray/rllib/agents/es/utils.py +++ b/python/ray/rllib/agents/es/utils.py @@ -31,8 +31,9 @@ def compute_centered_ranks(x): def make_session(single_threaded): if not single_threaded: return tf.Session() - return tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=1, - intra_op_parallelism_threads=1)) + return tf.Session( + config=tf.ConfigProto( + inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)) def itergroups(items, group_size): @@ -50,10 +51,11 @@ def itergroups(items, group_size): def batched_weighted_sum(weights, vecs, batch_size): total = 0 num_items_summed = 0 - for batch_weights, batch_vecs in zip(itergroups(weights, batch_size), - itergroups(vecs, batch_size)): + for batch_weights, batch_vecs in zip( + itergroups(weights, batch_size), itergroups(vecs, batch_size)): assert len(batch_weights) == len(batch_vecs) <= batch_size - total += np.dot(np.asarray(batch_weights, dtype=np.float32), - np.asarray(batch_vecs, dtype=np.float32)) + total += np.dot( + np.asarray(batch_weights, dtype=np.float32), + np.asarray(batch_vecs, dtype=np.float32)) num_items_summed += len(batch_weights) return total, num_items_summed diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index 0bd4c33b4..c66146832 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -7,7 +7,6 @@ from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph from ray.rllib.optimizers import SyncSamplesOptimizer from ray.tune.trial import Resources - DEFAULT_CONFIG = with_common_config({ # No remote workers by default "num_workers": 0, @@ -43,9 +42,9 @@ class PGAgent(Agent): self.env_creator, PGPolicyGraph) self.remote_evaluators = self.make_remote_evaluators( self.env_creator, PGPolicyGraph, self.config["num_workers"], {}) - self.optimizer = SyncSamplesOptimizer( - self.local_evaluator, self.remote_evaluators, - self.config["optimizer"]) + self.optimizer = SyncSamplesOptimizer(self.local_evaluator, + self.remote_evaluators, + self.config["optimizer"]) def _train(self): prev_steps = self.optimizer.num_steps_sampled diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index cbd9b2745..bb831c47d 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -42,9 +42,15 @@ class PGPolicyGraph(TFPolicyGraph): ] TFPolicyGraph.__init__( - self, obs_space, action_space, sess, obs_input=obs, - action_sampler=action_dist.sample(), loss=loss, - loss_inputs=loss_in, state_inputs=self.model.state_in, + self, + obs_space, + action_space, + sess, + obs_input=obs, + action_sampler=action_dist.sample(), + loss=loss, + loss_inputs=loss_in, + state_inputs=self.model.state_in, state_outputs=self.model.state_out, seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 2f8b403aa..120619d47 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -77,28 +77,30 @@ class PPOAgent(Agent): self.local_evaluator = self.make_local_evaluator( self.env_creator, PPOPolicyGraph) self.remote_evaluators = self.make_remote_evaluators( - self.env_creator, PPOPolicyGraph, self.config["num_workers"], - {"num_cpus": self.config["num_cpus_per_worker"], - "num_gpus": self.config["num_gpus_per_worker"]}) + self.env_creator, PPOPolicyGraph, self.config["num_workers"], { + "num_cpus": self.config["num_cpus_per_worker"], + "num_gpus": self.config["num_gpus_per_worker"] + }) if self.config["simple_optimizer"]: self.optimizer = SyncSamplesOptimizer( self.local_evaluator, self.remote_evaluators, {"num_sgd_iter": self.config["num_sgd_iter"]}) else: self.optimizer = LocalMultiGPUOptimizer( - self.local_evaluator, self.remote_evaluators, - {"sgd_batch_size": self.config["sgd_batchsize"], - "sgd_stepsize": self.config["sgd_stepsize"], - "num_sgd_iter": self.config["num_sgd_iter"], - "timesteps_per_batch": self.config["timesteps_per_batch"], - "standardize_fields": ["advantages"]}) + self.local_evaluator, self.remote_evaluators, { + "sgd_batch_size": self.config["sgd_batchsize"], + "sgd_stepsize": self.config["sgd_stepsize"], + "num_sgd_iter": self.config["num_sgd_iter"], + "timesteps_per_batch": self.config["timesteps_per_batch"], + "standardize_fields": ["advantages"] + }) def _train(self): prev_steps = self.optimizer.num_steps_sampled fetches = self.optimizer.step() self.local_evaluator.for_policy(lambda pi: pi.update_kl(fetches["kl"])) - FilterManager.synchronize( - self.local_evaluator.filters, self.remote_evaluators) + FilterManager.synchronize(self.local_evaluator.filters, + self.remote_evaluators) res = self.optimizer.collect_metrics() res = res._replace( timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps, @@ -115,9 +117,7 @@ class PPOAgent(Agent): "checkpoint-{}".format(self.iteration)) agent_state = ray.get( [a.save.remote() for a in self.remote_evaluators]) - extra_data = [ - self.local_evaluator.save(), - agent_state] + extra_data = [self.local_evaluator.save(), agent_state] pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb")) return checkpoint_path @@ -126,4 +126,5 @@ class PPOAgent(Agent): self.local_evaluator.restore(extra_data[0]) ray.get([ a.restore.remote(o) - for (a, o) in zip(self.remote_evaluators, extra_data[1])]) + for (a, o) in zip(self.remote_evaluators, extra_data[1]) + ]) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 2bc6d5507..df3444318 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -10,10 +10,20 @@ from ray.rllib.models.catalog import ModelCatalog class PPOLoss(object): - def __init__( - self, action_space, value_targets, advantages, actions, logits, - vf_preds, curr_action_dist, value_fn, cur_kl_coeff, - entropy_coeff=0, clip_param=0.1, vf_loss_coeff=1.0, use_gae=True): + def __init__(self, + action_space, + value_targets, + advantages, + actions, + logits, + vf_preds, + curr_action_dist, + value_fn, + cur_kl_coeff, + entropy_coeff=0, + clip_param=0.1, + vf_loss_coeff=1.0, + use_gae=True): """Constructs the loss for Proximal Policy Objective. Arguments: @@ -51,31 +61,33 @@ class PPOLoss(object): surrogate_loss = tf.minimum( advantages * logp_ratio, - advantages * tf.clip_by_value( - logp_ratio, 1 - clip_param, 1 + clip_param)) + advantages * tf.clip_by_value(logp_ratio, 1 - clip_param, + 1 + clip_param)) self.mean_policy_loss = tf.reduce_mean(-surrogate_loss) if use_gae: vf_loss1 = tf.square(value_fn - value_targets) - vf_clipped = vf_preds + tf.clip_by_value( - value_fn - vf_preds, -clip_param, clip_param) + vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds, + -clip_param, clip_param) vf_loss2 = tf.square(vf_clipped - value_targets) vf_loss = tf.maximum(vf_loss1, vf_loss2) self.mean_vf_loss = tf.reduce_mean(vf_loss) - loss = tf.reduce_mean( - -surrogate_loss + cur_kl_coeff*action_kl + - vf_loss_coeff*vf_loss - entropy_coeff*curr_entropy) + loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl + + vf_loss_coeff * vf_loss - + entropy_coeff * curr_entropy) else: self.mean_vf_loss = tf.constant(0.0) - loss = tf.reduce_mean( - -surrogate_loss + cur_kl_coeff*action_kl - - entropy_coeff*curr_entropy) + loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl - + entropy_coeff * curr_entropy) self.loss = loss class PPOPolicyGraph(TFPolicyGraph): - def __init__(self, observation_space, action_space, - config, existing_inputs=None): + def __init__(self, + observation_space, + action_space, + config, + existing_inputs=None): """ Arguments: observation_space: Environment observation space specification. @@ -98,16 +110,18 @@ class PPOPolicyGraph(TFPolicyGraph): existing_seq_lens = existing_inputs[-1] else: obs_ph = tf.placeholder( - tf.float32, name="obs", shape=(None,)+observation_space.shape) + tf.float32, + name="obs", + shape=(None, ) + observation_space.shape) adv_ph = tf.placeholder( - tf.float32, name="advantages", shape=(None,)) + tf.float32, name="advantages", shape=(None, )) act_ph = ModelCatalog.get_action_placeholder(action_space) logits_ph = tf.placeholder( tf.float32, name="logits", shape=(None, logit_dim)) vf_preds_ph = tf.placeholder( - tf.float32, name="vf_preds", shape=(None,)) + tf.float32, name="vf_preds", shape=(None, )) value_targets_ph = tf.placeholder( - tf.float32, name="value_targets", shape=(None,)) + tf.float32, name="value_targets", shape=(None, )) existing_state_in = None existing_seq_lens = None @@ -120,13 +134,19 @@ class PPOPolicyGraph(TFPolicyGraph): ("vf_preds", vf_preds_ph), ] self.model = ModelCatalog.get_model( - obs_ph, logit_dim, self.config["model"], - state_in=existing_state_in, seq_lens=existing_seq_lens) + obs_ph, + logit_dim, + self.config["model"], + state_in=existing_state_in, + seq_lens=existing_seq_lens) # KL Coefficient self.kl_coeff = tf.get_variable( initializer=tf.constant_initializer(self.kl_coeff_val), - name="kl_coeff", shape=(), trainable=False, dtype=tf.float32) + name="kl_coeff", + shape=(), + trainable=False, + dtype=tf.float32) self.logits = self.model.outputs curr_action_dist = dist_cls(self.logits) @@ -146,20 +166,32 @@ class PPOPolicyGraph(TFPolicyGraph): self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1]) self.loss_obj = PPOLoss( - action_space, value_targets_ph, adv_ph, act_ph, - logits_ph, vf_preds_ph, - curr_action_dist, self.value_function, self.kl_coeff, + action_space, + value_targets_ph, + adv_ph, + act_ph, + logits_ph, + vf_preds_ph, + curr_action_dist, + self.value_function, + self.kl_coeff, entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"], vf_loss_coeff=self.config["kl_target"], use_gae=self.config["use_gae"]) TFPolicyGraph.__init__( - self, observation_space, action_space, - self.sess, obs_input=obs_ph, - action_sampler=self.sampler, loss=self.loss_obj.loss, - loss_inputs=self.loss_in, state_inputs=self.model.state_in, - state_outputs=self.model.state_out, seq_lens=self.model.seq_lens, + self, + observation_space, + action_space, + self.sess, + obs_input=obs_ph, + action_sampler=self.sampler, + loss=self.loss_obj.loss, + loss_inputs=self.loss_in, + state_inputs=self.model.state_in, + state_outputs=self.model.state_out, + seq_lens=self.model.seq_lens, max_seq_len=config["model"]["max_seq_len"]) self.sess.run(tf.global_variables_initializer()) @@ -167,7 +199,9 @@ class PPOPolicyGraph(TFPolicyGraph): def copy(self, existing_inputs): """Creates a copy of self using existing input placeholders.""" return PPOPolicyGraph( - None, self.action_space, self.config, + None, + self.action_space, + self.config, existing_inputs=existing_inputs) def extra_compute_action_fetches(self): @@ -193,8 +227,11 @@ class PPOPolicyGraph(TFPolicyGraph): def postprocess_trajectory(self, sample_batch, other_agent_batches=None): last_r = 0.0 batch = compute_advantages( - sample_batch, last_r, self.config["gamma"], - self.config["lambda"], use_gae=self.config["use_gae"]) + sample_batch, + last_r, + self.config["gamma"], + self.config["lambda"], + use_gae=self.config["use_gae"]) return batch def optimizer(self): diff --git a/python/ray/rllib/agents/ppo/test/test.py b/python/ray/rllib/agents/ppo/test/test.py index d6454eb56..432b22f9a 100644 --- a/python/ray/rllib/agents/ppo/test/test.py +++ b/python/ray/rllib/agents/ppo/test/test.py @@ -13,7 +13,6 @@ from ray.rllib.agents.ppo.utils import flatten, concatenate # TODO(ekl): move to rllib/models dir class DistributionsTest(unittest.TestCase): - def testCategorical(self): num_samples = 100000 logits = tf.placeholder(tf.float32, shape=(None, 10)) @@ -32,10 +31,11 @@ class DistributionsTest(unittest.TestCase): class UtilsTest(unittest.TestCase): - def testFlatten(self): - d = {"s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]), - "a": np.array([[[5], [-5]], [[6], [-6]]])} + d = { + "s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]), + "a": np.array([[[5], [-5]], [[6], [-6]]]) + } flat = flatten(d.copy(), start=0, stop=2) assert_allclose(d["s"][0][0][:], flat["s"][0][:]) assert_allclose(d["s"][0][1][:], flat["s"][1][:]) diff --git a/python/ray/rllib/agents/ppo/utils.py b/python/ray/rllib/agents/ppo/utils.py index 5e8ac5a3a..e97dce5cf 100644 --- a/python/ray/rllib/agents/ppo/utils.py +++ b/python/ray/rllib/agents/ppo/utils.py @@ -16,7 +16,7 @@ def flatten(weights, start=0, stop=2): stop: The ending index. """ for key, val in weights.items(): - new_shape = val.shape[0:start] + (-1,) + val.shape[stop:] + new_shape = val.shape[0:start] + (-1, ) + val.shape[stop:] weights[key] = val.reshape(new_shape) return weights diff --git a/python/ray/rllib/env/async_vector_env.py b/python/ray/rllib/env/async_vector_env.py index 1d6a9b374..ba0d63c12 100644 --- a/python/ray/rllib/env/async_vector_env.py +++ b/python/ray/rllib/env/async_vector_env.py @@ -286,8 +286,8 @@ class _MultiAgentEnvState(object): self.reset() def poll(self): - obs, rew, dones, info = ( - self.last_obs, self.last_rewards, self.last_dones, self.last_infos) + obs, rew, dones, info = (self.last_obs, self.last_rewards, + self.last_dones, self.last_infos) self.last_obs = {} self.last_rewards = {} self.last_dones = {"__all__": False} @@ -303,10 +303,13 @@ class _MultiAgentEnvState(object): def reset(self): self.last_obs = self.env.reset() self.last_rewards = { - agent_id: None for agent_id in self.last_obs.keys()} + agent_id: None + for agent_id in self.last_obs.keys() + } self.last_dones = { - agent_id: False for agent_id in self.last_obs.keys()} - self.last_infos = { - agent_id: {} for agent_id in self.last_obs.keys()} + agent_id: False + for agent_id in self.last_obs.keys() + } + self.last_infos = {agent_id: {} for agent_id in self.last_obs.keys()} self.last_dones["__all__"] = False return self.last_obs diff --git a/python/ray/rllib/env/atari_wrappers.py b/python/ray/rllib/env/atari_wrappers.py index d9d7beffd..f9bf5b94a 100644 --- a/python/ray/rllib/env/atari_wrappers.py +++ b/python/ray/rllib/env/atari_wrappers.py @@ -28,8 +28,7 @@ class NoopResetEnv(gym.Wrapper): if self.override_num_noops is not None: noops = self.override_num_noops else: - noops = self.unwrapped.np_random.randint( - 1, self.noop_max + 1) + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) assert noops > 0 obs = None for _ in range(noops): @@ -121,7 +120,7 @@ class MaxAndSkipEnv(gym.Wrapper): gym.Wrapper.__init__(self, env) # most recent raw observations (for max pooling across time steps) self._obs_buffer = np.zeros( - (2,)+env.observation_space.shape, dtype=np.uint8) + (2, ) + env.observation_space.shape, dtype=np.uint8) self._skip = skip def step(self, action): diff --git a/python/ray/rllib/env/vector_env.py b/python/ray/rllib/env/vector_env.py index ef57be859..28791f552 100644 --- a/python/ray/rllib/env/vector_env.py +++ b/python/ray/rllib/env/vector_env.py @@ -71,8 +71,7 @@ class _VectorizedGymEnv(VectorEnv): self.envs = existing_envs self.num_envs = num_envs if make_env and num_envs > 1: - self.resetter = _AsyncResetter( - make_env, int(self.num_envs ** 0.5)) + self.resetter = _AsyncResetter(make_env, int(self.num_envs**0.5)) else: self.resetter = _SimpleResetter(make_env) while len(self.envs) < self.num_envs: diff --git a/python/ray/rllib/evaluation/metrics.py b/python/ray/rllib/evaluation/metrics.py index ceabce7ee..d4d0b5743 100644 --- a/python/ray/rllib/evaluation/metrics.py +++ b/python/ray/rllib/evaluation/metrics.py @@ -15,9 +15,10 @@ def collect_metrics(local_evaluator, remote_evaluators=[]): episode_rewards = [] episode_lengths = [] policy_rewards = collections.defaultdict(list) - metric_lists = ray.get( - [a.apply.remote(lambda ev: ev.sampler.get_metrics()) - for a in remote_evaluators]) + metric_lists = ray.get([ + a.apply.remote(lambda ev: ev.sampler.get_metrics()) + for a in remote_evaluators + ]) metric_lists.append(local_evaluator.sampler.get_metrics()) for metrics in metric_lists: for episode in metrics: diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index c513389c3..58f121ef4 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -82,24 +82,23 @@ class PolicyEvaluator(EvaluatorInterface): def as_remote(cls, num_cpus=None, num_gpus=None): return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls) - def __init__( - self, - env_creator, - policy_graph, - policy_mapping_fn=None, - tf_session_creator=None, - batch_steps=100, - batch_mode="truncate_episodes", - episode_horizon=None, - preprocessor_pref="rllib", - sample_async=False, - compress_observations=False, - num_envs=1, - observation_filter="NoFilter", - env_config=None, - model_config=None, - policy_config=None, - worker_index=0): + def __init__(self, + env_creator, + policy_graph, + policy_mapping_fn=None, + tf_session_creator=None, + batch_steps=100, + batch_mode="truncate_episodes", + episode_horizon=None, + preprocessor_pref="rllib", + sample_async=False, + compress_observations=False, + num_envs=1, + observation_filter="NoFilter", + env_config=None, + model_config=None, + policy_config=None, + worker_index=0): """Initialize a policy evaluator. Arguments: @@ -157,8 +156,8 @@ class PolicyEvaluator(EvaluatorInterface): policy_config = policy_config or {} self.policy_config = policy_config model_config = model_config or {} - policy_mapping_fn = ( - policy_mapping_fn or (lambda agent_id: DEFAULT_POLICY_ID)) + policy_mapping_fn = (policy_mapping_fn + or (lambda agent_id: DEFAULT_POLICY_ID)) self.env_creator = env_creator self.policy_graph = policy_graph self.batch_steps = batch_steps @@ -170,17 +169,21 @@ class PolicyEvaluator(EvaluatorInterface): isinstance(self.env, ServingEnv) or \ isinstance(self.env, MultiAgentEnv) or \ isinstance(self.env, AsyncVectorEnv): + def wrap(env): return env # we can't auto-wrap these env types elif is_atari(self.env) and \ "custom_preprocessor" not in model_config and \ preprocessor_pref == "deepmind": + def wrap(env): return wrap_deepmind(env, dim=model_config.get("dim", 80)) else: + def wrap(env): return ModelCatalog.get_preprocessor_as_wrapper( env, model_config) + self.env = wrap(self.env) def make_env(): @@ -193,20 +196,21 @@ class PolicyEvaluator(EvaluatorInterface): if tf_session_creator: self.tf_sess = tf_session_creator() else: - self.tf_sess = tf.Session(config=tf.ConfigProto( - gpu_options=tf.GPUOptions(allow_growth=True))) + self.tf_sess = tf.Session( + config=tf.ConfigProto( + gpu_options=tf.GPUOptions(allow_growth=True))) with self.tf_sess.as_default(): self.policy_map = self._build_policy_map( policy_dict, policy_config) else: - self.policy_map = self._build_policy_map( - policy_dict, policy_config) + self.policy_map = self._build_policy_map(policy_dict, + policy_config) self.multiagent = self.policy_map.keys() != set(DEFAULT_POLICY_ID) self.filters = { - policy_id: get_filter( - observation_filter, policy.observation_space.shape) + policy_id: get_filter(observation_filter, + policy.observation_space.shape) for (policy_id, policy) in self.policy_map.items() } @@ -226,24 +230,34 @@ class PolicyEvaluator(EvaluatorInterface): batch_steps = float("inf") # never cut episodes pack_episodes = False # sampler will return 1 episode per poll else: - raise ValueError( - "Unsupported batch mode: {}".format(self.batch_mode)) + raise ValueError("Unsupported batch mode: {}".format( + self.batch_mode)) if sample_async: self.sampler = AsyncSampler( - self.async_env, self.policy_map, policy_mapping_fn, - self.filters, batch_steps, horizon=episode_horizon, - pack=pack_episodes, tf_sess=self.tf_sess) + self.async_env, + self.policy_map, + policy_mapping_fn, + self.filters, + batch_steps, + horizon=episode_horizon, + pack=pack_episodes, + tf_sess=self.tf_sess) self.sampler.start() else: self.sampler = SyncSampler( - self.async_env, self.policy_map, policy_mapping_fn, - self.filters, batch_steps, horizon=episode_horizon, - pack=pack_episodes, tf_sess=self.tf_sess) + self.async_env, + self.policy_map, + policy_mapping_fn, + self.filters, + batch_steps, + horizon=episode_horizon, + pack=pack_episodes, + tf_sess=self.tf_sess) def _build_policy_map(self, policy_dict, policy_config): policy_map = {} - for name, (cls, obs_space, act_space, conf) in sorted( - policy_dict.items()): + for name, (cls, obs_space, act_space, + conf) in sorted(policy_dict.items()): merged_conf = policy_config.copy() merged_conf.update(conf) with tf.variable_scope(name): @@ -315,7 +329,8 @@ class PolicyEvaluator(EvaluatorInterface): def get_weights(self): return { pid: policy.get_weights() - for pid, policy in self.policy_map.items()} + for pid, policy in self.policy_map.items() + } def set_weights(self, weights): for pid, w in weights.items(): @@ -351,9 +366,7 @@ class PolicyEvaluator(EvaluatorInterface): builder, grad) for pid, grad in grads.items() } - return { - k: builder.get(v) for k, v in outputs.items() - } + return {k: builder.get(v) for k, v in outputs.items()} else: return { pid: self.policy_map[pid].apply_gradients(g) @@ -428,8 +441,9 @@ def _validate_and_canonicalize(policy_graph, env): raise ValueError("policy_graph must be a rllib.PolicyGraph class") else: return { - DEFAULT_POLICY_ID: ( - policy_graph, env.observation_space, env.action_space, {})} + DEFAULT_POLICY_ID: (policy_graph, env.observation_space, + env.action_space, {}) + } def _has_tensorflow_graph(policy_dict): diff --git a/python/ray/rllib/evaluation/sample_batch.py b/python/ray/rllib/evaluation/sample_batch.py index 14584b41f..109db4d3f 100644 --- a/python/ray/rllib/evaluation/sample_batch.py +++ b/python/ray/rllib/evaluation/sample_batch.py @@ -45,7 +45,8 @@ class SampleBatchBuilder(object): """Returns a sample batch including all previously added values.""" batch = SampleBatch( - {k: to_float_array(v) for k, v in self.buffers.items()}) + {k: to_float_array(v) + for k, v in self.buffers.items()}) self.buffers.clear() self.count = 0 return batch @@ -69,7 +70,9 @@ class MultiAgentSampleBatchBuilder(object): self.policy_map = policy_map self.policy_builders = { - k: SampleBatchBuilder() for k in policy_map.keys()} + k: SampleBatchBuilder() + for k in policy_map.keys() + } self.agent_builders = {} self.agent_to_policy = {} self.count = 0 # increment this manually diff --git a/python/ray/rllib/evaluation/sampler.py b/python/ray/rllib/evaluation/sampler.py index 4ea09652c..6ae66e6da 100644 --- a/python/ray/rllib/evaluation/sampler.py +++ b/python/ray/rllib/evaluation/sampler.py @@ -12,12 +12,11 @@ from ray.rllib.evaluation.sample_batch import MultiAgentSampleBatchBuilder, \ from ray.rllib.env.async_vector_env import AsyncVectorEnv from ray.rllib.utils.tf_run_builder import TFRunBuilder - RolloutMetrics = namedtuple( "RolloutMetrics", ["episode_length", "episode_reward", "agent_rewards"]) -PolicyEvalData = namedtuple( - "PolicyEvalData", ["env_id", "agent_id", "obs", "rnn_state"]) +PolicyEvalData = namedtuple("PolicyEvalData", + ["env_id", "agent_id", "obs", "rnn_state"]) class SyncSampler(object): @@ -29,9 +28,15 @@ class SyncSampler(object): This class provides data on invocation, rather than on a separate thread.""" - def __init__( - self, env, policies, policy_mapping_fn, obs_filters, - num_local_steps, horizon=None, pack=False, tf_sess=None): + def __init__(self, + env, + policies, + policy_mapping_fn, + obs_filters, + num_local_steps, + horizon=None, + pack=False, + tf_sess=None): self.async_vector_env = AsyncVectorEnv.wrap_async(env) self.num_local_steps = num_local_steps self.horizon = horizon @@ -68,9 +73,15 @@ class AsyncSampler(threading.Thread): Note that batch_size is only a unit of measure here. Batches can accumulate and the gradient can be calculated on up to 5 batches.""" - def __init__( - self, env, policies, policy_mapping_fn, obs_filters, - num_local_steps, horizon=None, pack=False, tf_sess=None): + def __init__(self, + env, + policies, + policy_mapping_fn, + obs_filters, + num_local_steps, + horizon=None, + pack=False, + tf_sess=None): for _, f in obs_filters.items(): assert getattr(f, "is_concurrent", False), \ "Observation Filter must support concurrent updates." @@ -142,9 +153,14 @@ class AsyncSampler(threading.Thread): return completed -def _env_runner( - async_vector_env, policies, policy_mapping_fn, num_local_steps, - horizon, obs_filters, pack, tf_sess=None): +def _env_runner(async_vector_env, + policies, + policy_mapping_fn, + num_local_steps, + horizon, + obs_filters, + pack, + tf_sess=None): """This implements the common experience collection logic. Args: @@ -186,9 +202,11 @@ def _env_runner( else: return MultiAgentSampleBatchBuilder(policies) - active_episodes = defaultdict( - lambda: _MultiAgentEpisode( - policies, policy_mapping_fn, get_batch_builder)) + def new_episode(): + return _MultiAgentEpisode(policies, policy_mapping_fn, + get_batch_builder) + + active_episodes = defaultdict(new_episode) while True: # Get observations from all ready agents @@ -213,9 +231,8 @@ def _env_runner( # Check episode termination conditions if dones[env_id]["__all__"] or episode.length >= horizon: all_done = True - yield RolloutMetrics( - episode.length, episode.total_reward, - dict(episode.agent_rewards)) + yield RolloutMetrics(episode.length, episode.total_reward, + dict(episode.agent_rewards)) else: all_done = False # At least send an empty dict if not done @@ -228,9 +245,8 @@ def _env_runner( agent_done = bool(all_done or dones[env_id].get(agent_id)) if not agent_done: to_eval[policy_id].append( - PolicyEvalData( - env_id, agent_id, filtered_obs, - episode.rnn_state_for(agent_id))) + PolicyEvalData(env_id, agent_id, filtered_obs, + episode.rnn_state_for(agent_id))) last_observation = episode.last_observation_for(agent_id) episode.set_last_observation(agent_id, filtered_obs) @@ -274,13 +290,12 @@ def _env_runner( episode = active_episodes[env_id] for agent_id, raw_obs in resetted_obs.items(): policy_id = episode.policy_for(agent_id) - filtered_obs = _get_or_raise( - obs_filters, policy_id)(raw_obs) + filtered_obs = _get_or_raise(obs_filters, + policy_id)(raw_obs) episode.set_last_observation(agent_id, filtered_obs) to_eval[policy_id].append( - PolicyEvalData( - env_id, agent_id, filtered_obs, - episode.rnn_state_for(agent_id))) + PolicyEvalData(env_id, agent_id, filtered_obs, + episode.rnn_state_for(agent_id))) # Batch eval policy actions if possible if tf_sess: @@ -295,7 +310,8 @@ def _env_runner( policy = _get_or_raise(policies, policy_id) if builder: eval_results[policy_id] = policy.build_compute_actions( - builder, [t.obs for t in eval_data], rnn_in, + builder, [t.obs for t in eval_data], + rnn_in, is_training=True) else: eval_results[policy_id] = policy.compute_actions( @@ -319,7 +335,8 @@ def _env_runner( episode = active_episodes[env_id] episode.set_rnn_state(agent_id, [c[i] for c in rnn_out_cols]) episode.set_last_pi_info( - agent_id, {k: v[i] for k, v in pi_info_cols.items()}) + agent_id, {k: v[i] + for k, v in pi_info_cols.items()}) if env_id in off_policy_actions and \ agent_id in off_policy_actions[env_id]: episode.set_last_action( @@ -334,8 +351,7 @@ def _env_runner( def _to_column_format(rnn_state_rows): num_cols = len(rnn_state_rows[0]) - return [ - [row[i] for row in rnn_state_rows] for i in range(num_cols)] + return [[row[i] for row in rnn_state_rows] for i in range(num_cols)] def _get_or_raise(mapping, policy_id): @@ -363,8 +379,8 @@ class _MultiAgentEpisode(object): def add_agent_rewards(self, reward_dict): for agent_id, reward in reward_dict.items(): if reward is not None: - self.agent_rewards[ - agent_id, self.policy_for(agent_id)] += reward + self.agent_rewards[agent_id, + self.policy_for(agent_id)] += reward self.total_reward += reward def policy_for(self, agent_id): diff --git a/python/ray/rllib/evaluation/tf_policy_graph.py b/python/ray/rllib/evaluation/tf_policy_graph.py index 58f5e3cac..d7225d7a4 100644 --- a/python/ray/rllib/evaluation/tf_policy_graph.py +++ b/python/ray/rllib/evaluation/tf_policy_graph.py @@ -35,10 +35,18 @@ class TFPolicyGraph(PolicyGraph): SampleBatch({"action": ..., "advantages": ..., ...}) """ - def __init__( - self, observation_space, action_space, sess, obs_input, - action_sampler, loss, loss_inputs, state_inputs=None, - state_outputs=None, seq_lens=None, max_seq_len=20): + def __init__(self, + observation_space, + action_space, + sess, + obs_input, + action_sampler, + loss, + loss_inputs, + state_inputs=None, + state_outputs=None, + seq_lens=None, + max_seq_len=20): """Initialize the policy graph. Arguments: @@ -78,9 +86,9 @@ class TFPolicyGraph(PolicyGraph): self._seq_lens = seq_lens self._max_seq_len = max_seq_len self._optimizer = self.optimizer() - self._grads_and_vars = [ - (g, v) for (g, v) in self.gradients(self._optimizer) - if g is not None] + self._grads_and_vars = [(g, v) + for (g, v) in self.gradients(self._optimizer) + if g is not None] self._grads = [g for (g, v) in self._grads_and_vars] self._apply_op = self._optimizer.apply_gradients(self._grads_and_vars) self._variables = ray.experimental.TensorFlowVariables( @@ -92,8 +100,11 @@ class TFPolicyGraph(PolicyGraph): if self._state_inputs: assert self._seq_lens is not None - def build_compute_actions( - self, builder, obs_batch, state_batches=None, is_training=False): + def build_compute_actions(self, + builder, + obs_batch, + state_batches=None, + is_training=False): state_batches = state_batches or [] assert len(self._state_inputs) == len(state_batches), \ (self._state_inputs, state_batches) @@ -103,16 +114,15 @@ class TFPolicyGraph(PolicyGraph): builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))}) builder.add_feed_dict({self._is_training: is_training}) builder.add_feed_dict(dict(zip(self._state_inputs, state_batches))) - fetches = builder.add_fetches( - [self._sampler] + self._state_outputs + - [self.extra_compute_action_fetches()]) + fetches = builder.add_fetches([self._sampler] + self._state_outputs + + [self.extra_compute_action_fetches()]) return fetches[0], fetches[1:-1], fetches[-1] - def compute_actions( - self, obs_batch, state_batches=None, is_training=False): + def compute_actions(self, obs_batch, state_batches=None, + is_training=False): builder = TFRunBuilder(self._sess, "compute_actions") - fetches = self.build_compute_actions( - builder, obs_batch, state_batches, is_training) + fetches = self.build_compute_actions(builder, obs_batch, state_batches, + is_training) return builder.get(fetches) def _get_loss_inputs_dict(self, batch): @@ -127,12 +137,11 @@ class TFPolicyGraph(PolicyGraph): # RNN case feature_keys = [k for k, v in self._loss_inputs] state_keys = [ - "state_in_{}".format(i) for i in range(len(self._state_inputs))] + "state_in_{}".format(i) for i in range(len(self._state_inputs)) + ] feature_sequences, initial_states, seq_lens = chop_into_sequences( - batch["t"], - [batch[k] for k in feature_keys], - [batch[k] for k in state_keys], - self._max_seq_len) + batch["t"], [batch[k] for k in feature_keys], + [batch[k] for k in state_keys], self._max_seq_len) for k, v in zip(feature_keys, feature_sequences): feed_dict[self._loss_input_dict[k]] = v for k, v in zip(state_keys, initial_states): @@ -172,9 +181,11 @@ class TFPolicyGraph(PolicyGraph): builder.add_feed_dict(self.extra_apply_grad_feed_dict()) builder.add_feed_dict(self._get_loss_inputs_dict(postprocessed_batch)) builder.add_feed_dict({self._is_training: True}) - fetches = builder.add_fetches( - [self._apply_op, self.extra_compute_grad_fetches(), - self.extra_apply_grad_fetches()]) + fetches = builder.add_fetches([ + self._apply_op, + self.extra_compute_grad_fetches(), + self.extra_apply_grad_fetches() + ]) return fetches[1], fetches[2] def compute_apply(self, postprocessed_batch): diff --git a/python/ray/rllib/evaluation/torch_policy_graph.py b/python/ray/rllib/evaluation/torch_policy_graph.py index 778eeff2e..069ca2244 100644 --- a/python/ray/rllib/evaluation/torch_policy_graph.py +++ b/python/ray/rllib/evaluation/torch_policy_graph.py @@ -27,8 +27,8 @@ class TorchPolicyGraph(PolicyGraph): This is necessary when using the async sampler. """ - def __init__( - self, observation_space, action_space, model, loss, loss_inputs): + def __init__(self, observation_space, action_space, model, loss, + loss_inputs): """Build a policy graph from policy and loss torch modules. Note that module inputs will be CPU tensors. The model and loss modules @@ -67,8 +67,8 @@ class TorchPolicyGraph(PolicyGraph): """Custom PyTorch optimizer to use.""" return torch.optim.Adam(self._model.parameters()) - def compute_actions( - self, obs_batch, state_batches=None, is_training=False): + def compute_actions(self, obs_batch, state_batches=None, + is_training=False): if state_batches: raise NotImplementedError("Torch RNN support") with self.lock: diff --git a/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py b/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py index 1e97264a5..4e01bbc77 100644 --- a/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py +++ b/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py @@ -20,13 +20,12 @@ def pass_params_to_gym(env_name): global env_version_num register( - id=env_name, - entry_point=( - "ray.rllib.examples.legacy_multiagent.multiagent_mountaincar_env:" - "MultiAgentMountainCarEnv"), - max_episode_steps=200, - kwargs={} - ) + id=env_name, + entry_point=( + "ray.rllib.examples.legacy_multiagent.multiagent_mountaincar_env:" + "MultiAgentMountainCarEnv"), + max_episode_steps=200, + kwargs={}) def create_env(env_config): @@ -48,10 +47,12 @@ if __name__ == '__main__': config["horizon"] = horizon config["use_gae"] = False config["model"].update({"fcnet_hiddens": [256, 256]}) - options = {"multiagent_obs_shapes": [2, 2], - "multiagent_act_shapes": [1, 1], - "multiagent_shared_model": False, - "multiagent_fcnet_hiddens": [[32, 32]] * 2} + options = { + "multiagent_obs_shapes": [2, 2], + "multiagent_act_shapes": [1, 1], + "multiagent_shared_model": False, + "multiagent_fcnet_hiddens": [[32, 32]] * 2 + } config["model"].update({"custom_options": options}) alg = ppo.PPOAgent(env=env_name, config=config) for i in range(1): diff --git a/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar_env.py b/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar_env.py index d454937ac..c120f00c9 100644 --- a/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar_env.py +++ b/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar_env.py @@ -2,7 +2,6 @@ from math import cos from gym.spaces import Box, Tuple, Discrete import numpy as np from gym.envs.classic_control.mountain_car import MountainCarEnv - """ Multiagent mountain car that sums and then averages its actions to produce the velocity @@ -22,8 +21,8 @@ class MultiAgentMountainCarEnv(MountainCarEnv): self.viewer = None self.action_space = [Discrete(3) for _ in range(2)] - self.observation_space = Tuple([ - Box(self.low, self.high, dtype=np.float32) for _ in range(2)]) + self.observation_space = Tuple( + [Box(self.low, self.high, dtype=np.float32) for _ in range(2)]) self.seed() self.reset() diff --git a/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py b/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py index c78b5d601..098ad6954 100644 --- a/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py +++ b/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py @@ -20,13 +20,12 @@ def pass_params_to_gym(env_name): global env_version_num register( - id=env_name, - entry_point=( - "ray.rllib.examples.legacy_multiagent.multiagent_pendulum_env:" - "MultiAgentPendulumEnv"), - max_episode_steps=100, - kwargs={} - ) + id=env_name, + entry_point=( + "ray.rllib.examples.legacy_multiagent.multiagent_pendulum_env:" + "MultiAgentPendulumEnv"), + max_episode_steps=100, + kwargs={}) def create_env(env_config): @@ -49,10 +48,12 @@ if __name__ == '__main__': config["horizon"] = horizon config["use_gae"] = True config["model"].update({"fcnet_hiddens": [256, 256]}) - options = {"multiagent_obs_shapes": [3, 3], - "multiagent_act_shapes": [1, 1], - "multiagent_shared_model": True, - "multiagent_fcnet_hiddens": [[32, 32]] * 2} + options = { + "multiagent_obs_shapes": [3, 3], + "multiagent_act_shapes": [1, 1], + "multiagent_shared_model": True, + "multiagent_fcnet_hiddens": [[32, 32]] * 2 + } config["model"].update({"custom_options": options}) alg = ppo.PPOAgent(env=env_name, config=config) for i in range(1): diff --git a/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum_env.py b/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum_env.py index 44c86f4e6..026458327 100644 --- a/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum_env.py +++ b/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum_env.py @@ -2,7 +2,6 @@ from gym.spaces import Box, Tuple from gym.utils import seeding from gym.envs.classic_control.pendulum import PendulumEnv import numpy as np - """ Multiagent pendulum that sums its torques to generate an action """ @@ -10,8 +9,8 @@ import numpy as np class MultiAgentPendulumEnv(PendulumEnv): metadata = { - 'render.modes': ['human', 'rgb_array'], - 'video.frames_per_second': 30 + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 30 } def __init__(self): @@ -21,13 +20,14 @@ class MultiAgentPendulumEnv(PendulumEnv): self.viewer = None high = np.array([1., 1., self.max_speed]) - self.action_space = [Box(low=-self.max_torque / 2, - high=self.max_torque / 2, - shape=(1,), - dtype=np.float32) - for _ in range(2)] - self.observation_space = Tuple([ - Box(low=-high, high=high, dtype=np.float32) for _ in range(2)]) + self.action_space = [ + Box(low=-self.max_torque / 2, + high=self.max_torque / 2, + shape=(1, ), + dtype=np.float32) for _ in range(2) + ] + self.observation_space = Tuple( + [Box(low=-high, high=high, dtype=np.float32) for _ in range(2)]) self.seed() @@ -49,8 +49,8 @@ class MultiAgentPendulumEnv(PendulumEnv): costs = self.angle_normalize(th) ** 2 + .1 * thdot ** 2 + \ .001 * (summed_u ** 2) - newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) + - 3. / (m * length ** 2) * summed_u) * dt + newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) + 3. / + (m * length**2) * summed_u) * dt newth = th + newthdot * dt newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) @@ -65,8 +65,10 @@ class MultiAgentPendulumEnv(PendulumEnv): def _get_obs(self): theta, thetadot = self.state - return [np.array([np.cos(theta), np.sin(theta), thetadot]) - for _ in range(2)] + return [ + np.array([np.cos(theta), np.sin(theta), thetadot]) + for _ in range(2) + ] def angle_normalize(self, x): return (((x + np.pi) % (2 * np.pi)) - np.pi) diff --git a/python/ray/rllib/examples/multiagent_cartpole.py b/python/ray/rllib/examples/multiagent_cartpole.py index 75c678c53..767bf84aa 100644 --- a/python/ray/rllib/examples/multiagent_cartpole.py +++ b/python/ray/rllib/examples/multiagent_cartpole.py @@ -1,7 +1,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - """Simple example of setting up a multi-agent policy mapping. Control the number of agents and policies via --num-agents and --num-policies. @@ -24,14 +23,12 @@ from ray.rllib.test.test_multi_agent_env import MultiCartpole from ray.tune.logger import pretty_print from ray.tune.registry import register_env - parser = argparse.ArgumentParser() parser.add_argument("--num-agents", type=int, default=4) parser.add_argument("--num-policies", type=int, default=2) parser.add_argument("--num-iters", type=int, default=20) - if __name__ == "__main__": args = parser.parse_args() ray.init() @@ -51,7 +48,8 @@ if __name__ == "__main__": # Setup PG with an ensemble of `num_policies` different policy graphs policy_graphs = { - "policy_{}".format(i): gen_policy() for i in range(args.num_policies) + "policy_{}".format(i): gen_policy() + for i in range(args.num_policies) } policy_ids = list(policy_graphs.keys()) diff --git a/python/ray/rllib/examples/serving/cartpole_client.py b/python/ray/rllib/examples/serving/cartpole_client.py index fb27e8567..6f6a2e189 100755 --- a/python/ray/rllib/examples/serving/cartpole_client.py +++ b/python/ray/rllib/examples/serving/cartpole_client.py @@ -1,7 +1,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - """Example of querying a policy server. Copy this file for your use case. To try this out, in two separate shells run: @@ -14,18 +13,19 @@ import gym from ray.rllib.utils.policy_client import PolicyClient - parser = argparse.ArgumentParser() parser.add_argument( "--no-train", action="store_true", help="Whether to disable training.") parser.add_argument( - "--off-policy", action="store_true", + "--off-policy", + action="store_true", help="Whether to take random instead of on-policy actions.") parser.add_argument( - "--stop-at-reward", type=int, default=9999, + "--stop-at-reward", + type=int, + default=9999, help="Stop once the specified reward is reached.") - if __name__ == "__main__": args = parser.parse_args() env = gym.make("CartPole-v0") diff --git a/python/ray/rllib/examples/serving/cartpole_server.py b/python/ray/rllib/examples/serving/cartpole_server.py index 7e6d79996..a64ce03e6 100755 --- a/python/ray/rllib/examples/serving/cartpole_server.py +++ b/python/ray/rllib/examples/serving/cartpole_server.py @@ -1,7 +1,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - """Example of running a policy server. Copy this file for your use case. To try this out, in two separate shells run: @@ -26,12 +25,12 @@ CHECKPOINT_FILE = "last_checkpoint.out" class CartpoleServing(ServingEnv): def __init__(self): - ServingEnv.__init__( - self, spaces.Discrete(2), spaces.Box(low=-10, high=10, shape=(4,))) + ServingEnv.__init__(self, spaces.Discrete(2), + spaces.Box(low=-10, high=10, shape=(4, ))) def run(self): - print("Starting policy server at {}:{}".format( - SERVER_ADDRESS, SERVER_PORT)) + print("Starting policy server at {}:{}".format(SERVER_ADDRESS, + SERVER_PORT)) server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT) server.serve_forever() @@ -42,14 +41,16 @@ if __name__ == "__main__": # We use DQN since it supports off-policy actions, but you can choose and # configure any agent. - dqn = DQNAgent(env="srv", config={ - # Use a single process to avoid needing to set up a load balancer - "num_workers": 0, - # Configure the agent to run short iterations for debugging - "exploration_fraction": 0.01, - "learning_starts": 100, - "timesteps_per_iteration": 200, - }) + dqn = DQNAgent( + env="srv", + config={ + # Use a single process to avoid needing to set up a load balancer + "num_workers": 0, + # Configure the agent to run short iterations for debugging + "exploration_fraction": 0.01, + "learning_starts": 100, + "timesteps_per_iteration": 200, + }) # Attempt to restore from checkpoint if possible. if os.path.exists(CHECKPOINT_FILE): diff --git a/python/ray/rllib/models/__init__.py b/python/ray/rllib/models/__init__.py index 91c2381f0..ddfdd16b8 100644 --- a/python/ray/rllib/models/__init__.py +++ b/python/ray/rllib/models/__init__.py @@ -6,7 +6,7 @@ from ray.rllib.models.preprocessors import Preprocessor from ray.rllib.models.fcnet import FullyConnectedNetwork from ray.rllib.models.lstm import LSTM - -__all__ = ["ActionDistribution", "Categorical", - "DiagGaussian", "Deterministic", "ModelCatalog", "Model", - "Preprocessor", "FullyConnectedNetwork", "LSTM"] +__all__ = [ + "ActionDistribution", "Categorical", "DiagGaussian", "Deterministic", + "ModelCatalog", "Model", "Preprocessor", "FullyConnectedNetwork", "LSTM" +] diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py index c4de85004..a88f5fa3a 100644 --- a/python/ray/rllib/models/action_dist.py +++ b/python/ray/rllib/models/action_dist.py @@ -42,25 +42,25 @@ class Categorical(ActionDistribution): logits=self.inputs, labels=x) def entropy(self): - a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1], - keepdims=True) + a0 = self.inputs - tf.reduce_max( + self.inputs, reduction_indices=[1], keepdims=True) ea0 = tf.exp(a0) z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True) p0 = ea0 / z0 return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1]) def kl(self, other): - a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1], - keepdims=True) - a1 = other.inputs - tf.reduce_max(other.inputs, reduction_indices=[1], - keepdims=True) + a0 = self.inputs - tf.reduce_max( + self.inputs, reduction_indices=[1], keepdims=True) + a1 = other.inputs - tf.reduce_max( + other.inputs, reduction_indices=[1], keepdims=True) ea0 = tf.exp(a0) ea1 = tf.exp(a1) z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True) z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True) p0 = ea0 / z0 - return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), - reduction_indices=[1]) + return tf.reduce_sum( + p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1]) def sample(self): return tf.squeeze(tf.multinomial(self.inputs, 1), axis=1) @@ -90,22 +90,23 @@ class DiagGaussian(ActionDistribution): self.std = tf.exp(log_std) def logp(self, x): - return (-0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), - reduction_indices=[1]) - + return (-0.5 * tf.reduce_sum( + tf.square((x - self.mean) / self.std), reduction_indices=[1]) - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) - tf.reduce_sum(self.log_std, reduction_indices=[1])) def kl(self, other): assert isinstance(other, DiagGaussian) - return tf.reduce_sum(other.log_std - self.log_std + - (tf.square(self.std) + - tf.square(self.mean - other.mean)) / - (2.0 * tf.square(other.std)) - 0.5, - reduction_indices=[1]) + return tf.reduce_sum( + other.log_std - self.log_std + + (tf.square(self.std) + tf.square(self.mean - other.mean)) / + (2.0 * tf.square(other.std)) - 0.5, + reduction_indices=[1]) def entropy(self): - return tf.reduce_sum(self.log_std + .5 * np.log(2.0 * np.pi * np.e), - reduction_indices=[1]) + return tf.reduce_sum( + self.log_std + .5 * np.log(2.0 * np.pi * np.e), + reduction_indices=[1]) def sample(self): out = self.mean + self.std * tf.random_normal(tf.shape(self.mean)) @@ -158,6 +159,7 @@ class MultiActionDistribution(ActionDistribution): Args: inputs (Tensor list): A list of tensors from which to compute samples. """ + def __init__(self, inputs, action_space, child_distributions): # you actually have to instantiate the child distributions self.reshaper = Reshaper(action_space.spaces) @@ -174,23 +176,25 @@ class MultiActionDistribution(ActionDistribution): # Remove extra categorical dimension if isinstance(distribution, Categorical): split_list[i] = tf.squeeze(split_list[i], axis=-1) - log_list = np.asarray([distribution.logp(split_x) for - distribution, split_x in - zip(self.child_distributions, split_list)]) + log_list = np.asarray([ + distribution.logp(split_x) for distribution, split_x in zip( + self.child_distributions, split_list) + ]) return np.sum(log_list) def kl(self, other): """The KL-divergence between two action distributions.""" - kl_list = np.asarray([distribution.kl(other_distribution) for - distribution, other_distribution in - zip(self.child_distributions, - other.child_distributions)]) + kl_list = np.asarray([ + distribution.kl(other_distribution) + for distribution, other_distribution in zip( + self.child_distributions, other.child_distributions) + ]) return np.sum(kl_list) def entropy(self): """The entropy of the action distribution.""" - entropy_list = np.array([s.entropy() for s in - self.child_distributions]) + entropy_list = np.array( + [s.entropy() for s in self.child_distributions]) return np.sum(entropy_list) def sample(self): diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py index db5717a83..b611d2e31 100644 --- a/python/ray/rllib/models/catalog.py +++ b/python/ray/rllib/models/catalog.py @@ -19,7 +19,6 @@ from ray.rllib.models.visionnet import VisionNetwork from ray.rllib.models.lstm import LSTM from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork - MODEL_CONFIGS = [ # === Built-in options === "conv_filters", # Filter configuration @@ -30,11 +29,9 @@ MODEL_CONFIGS = [ "grayscale", # Converts ATARI frame to 1 Channel Grayscale image "zero_mean", # Changes frame to range from [-1, 1] if true "extra_frameskip", # (int) for number of frames to skip - "free_log_std", # Documented in ray.rllib.models.Model "channel_major", # Pytorch conv requires images to be channel-major "squash_to_range", # Whether to squash the action output to space range - "use_lstm", # Whether to wrap the model with a LSTM "max_seq_len", # Max seq len for training the LSTM, defaults to 20 "lstm_cell_size", # Size of the LSTM cell @@ -81,8 +78,8 @@ class ModelCatalog(object): if dist_type is None: dist = DiagGaussian if config.get("squash_to_range"): - dist = squash_to_range( - dist, action_space.low, action_space.high) + dist = squash_to_range(dist, action_space.low, + action_space.high) return dist, action_space.shape[0] * 2 elif dist_type == 'deterministic': return Deterministic, action_space.shape[0] @@ -95,12 +92,13 @@ class ModelCatalog(object): dist, action_size = ModelCatalog.get_action_dist(action) child_dist.append(dist) size += action_size - return partial(MultiActionDistribution, - child_distributions=child_dist, - action_space=action_space), size + return partial( + MultiActionDistribution, + child_distributions=child_dist, + action_space=action_space), size - raise NotImplementedError( - "Unsupported args: {} {}".format(action_space, dist_type)) + raise NotImplementedError("Unsupported args: {} {}".format( + action_space, dist_type)) @staticmethod def get_action_placeholder(action_space): @@ -120,7 +118,7 @@ class ModelCatalog(object): return tf.placeholder( tf.float32, shape=(None, action_space.shape[0]), name="action") elif isinstance(action_space, gym.spaces.Discrete): - return tf.placeholder(tf.int64, shape=(None,), name="action") + return tf.placeholder(tf.int64, shape=(None, ), name="action") elif isinstance(action_space, gym.spaces.Tuple): size = 0 all_discrete = True @@ -131,15 +129,19 @@ class ModelCatalog(object): all_discrete = False size += np.product(action_space.spaces[i].shape) return tf.placeholder( - tf.int64 if all_discrete else tf.float32, shape=(None, size), + tf.int64 if all_discrete else tf.float32, + shape=(None, size), name="action") else: raise NotImplementedError("action space {}" " not supported".format(action_space)) @staticmethod - def get_model( - inputs, num_outputs, options=None, state_in=None, seq_lens=None): + def get_model(inputs, + num_outputs, + options=None, + state_in=None, + seq_lens=None): """Returns a suitable model conforming to given input and output specs. Args: @@ -154,12 +156,12 @@ class ModelCatalog(object): """ options = options or {} - model = ModelCatalog._get_model( - inputs, num_outputs, options, state_in, seq_lens) + model = ModelCatalog._get_model(inputs, num_outputs, options, state_in, + seq_lens) if options.get("use_lstm"): - model = LSTM( - model.last_layer, num_outputs, options, state_in, seq_lens) + model = LSTM(model.last_layer, num_outputs, options, state_in, + seq_lens) return model @@ -169,16 +171,20 @@ class ModelCatalog(object): model = options["custom_model"] print("Using custom model {}".format(model)) return _global_registry.get(RLLIB_MODEL, model)( - inputs, num_outputs, options, - state_in=state_in, seq_lens=seq_lens) + inputs, + num_outputs, + options, + state_in=state_in, + seq_lens=seq_lens) obs_rank = len(inputs.shape) - 1 # num_outputs > 1 used to avoid hitting this with the value function - if isinstance(options.get("custom_options", {}).get( - "multiagent_fcnet_hiddens", 1), list) and num_outputs > 1: - return MultiAgentFullyConnectedNetwork( - inputs, num_outputs, options) + if isinstance( + options.get("custom_options", {}).get( + "multiagent_fcnet_hiddens", 1), list) and num_outputs > 1: + return MultiAgentFullyConnectedNetwork(inputs, num_outputs, + options) if obs_rank > 1: return VisionNetwork(inputs, num_outputs, options) @@ -198,10 +204,10 @@ class ModelCatalog(object): Returns: model (Model): Neural network model. """ - from ray.rllib.models.pytorch.fcnet import ( - FullyConnectedNetwork as PyTorchFCNet) - from ray.rllib.models.pytorch.visionnet import ( - VisionNetwork as PyTorchVisionNet) + from ray.rllib.models.pytorch.fcnet import (FullyConnectedNetwork as + PyTorchFCNet) + from ray.rllib.models.pytorch.visionnet import (VisionNetwork as + PyTorchVisionNet) if "custom_model" in options: model = options["custom_model"] @@ -232,9 +238,8 @@ class ModelCatalog(object): """ for k in options.keys(): if k not in MODEL_CONFIGS: - raise Exception( - "Unknown config key `{}`, all keys: {}".format( - k, MODEL_CONFIGS)) + raise Exception("Unknown config key `{}`, all keys: {}".format( + k, MODEL_CONFIGS)) if "custom_preprocessor" in options: preprocessor = options["custom_preprocessor"] @@ -271,8 +276,8 @@ class ModelCatalog(object): preprocessor_name (str): Name to register the preprocessor under. preprocessor_class (type): Python class of the preprocessor. """ - _global_registry.register( - RLLIB_PREPROCESSOR, preprocessor_name, preprocessor_class) + _global_registry.register(RLLIB_PREPROCESSOR, preprocessor_name, + preprocessor_class) @staticmethod def register_custom_model(model_name, model_class): diff --git a/python/ray/rllib/models/fcnet.py b/python/ray/rllib/models/fcnet.py index 3f5bcabf6..11aee2c0d 100644 --- a/python/ray/rllib/models/fcnet.py +++ b/python/ray/rllib/models/fcnet.py @@ -22,14 +22,17 @@ class FullyConnectedNetwork(Model): for size in hiddens: label = "fc{}".format(i) last_layer = slim.fully_connected( - last_layer, size, + last_layer, + size, weights_initializer=normc_initializer(1.0), activation_fn=activation, scope=label) i += 1 label = "fc_out" output = slim.fully_connected( - last_layer, num_outputs, + last_layer, + num_outputs, weights_initializer=normc_initializer(0.01), - activation_fn=None, scope=label) + activation_fn=None, + scope=label) return output, last_layer diff --git a/python/ray/rllib/models/lstm.py b/python/ray/rllib/models/lstm.py index 55a9626cb..1365b5a69 100644 --- a/python/ray/rllib/models/lstm.py +++ b/python/ray/rllib/models/lstm.py @@ -1,7 +1,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - """LSTM support for RLlib. The main trick here is that we add the time dimension at the last moment. @@ -14,7 +13,6 @@ See the add_time_dimension() and chop_into_sequences() functions below for more info. """ - import numpy as np import tensorflow as tf import tensorflow.contrib.rnn as rnn @@ -46,14 +44,13 @@ def add_time_dimension(padded_inputs, seq_lens): # Dynamically reshape the padded batch to introduce a time dimension. new_batch_size = padded_batch_size // max_seq_len - new_shape = ( - [new_batch_size, max_seq_len] + - padded_inputs.get_shape().as_list()[1:]) + new_shape = ([new_batch_size, max_seq_len] + + padded_inputs.get_shape().as_list()[1:]) return tf.reshape(padded_inputs, new_shape) -def chop_into_sequences( - time_column, feature_columns, state_columns, max_seq_len): +def chop_into_sequences(time_column, feature_columns, state_columns, + max_seq_len): """Truncate and pad experiences into fixed-length sequences. Arguments: @@ -106,7 +103,7 @@ def chop_into_sequences( feature_sequences = [] for f in feature_columns: f = np.array(f) - f_pad = np.zeros((len(seq_lens) * max_seq_len,) + np.shape(f)[1:]) + f_pad = np.zeros((len(seq_lens) * max_seq_len, ) + np.shape(f)[1:]) seq_base = 0 i = 0 for l in seq_lens: @@ -152,7 +149,8 @@ class LSTM(Model): lstm = rnn.rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True) self.state_init = [ np.zeros(lstm.state_size.c, np.float32), - np.zeros(lstm.state_size.h, np.float32)] + np.zeros(lstm.state_size.h, np.float32) + ] # Setup LSTM inputs if self.state_in: @@ -170,12 +168,15 @@ class LSTM(Model): else: state_in = rnn.rnn_cell.LSTMStateTuple(c_in, h_in) lstm_out, lstm_state = tf.nn.dynamic_rnn( - lstm, last_layer, initial_state=state_in, - sequence_length=self.seq_lens, time_major=False) + lstm, + last_layer, + initial_state=state_in, + sequence_length=self.seq_lens, + time_major=False) self.state_out = list(lstm_state) # Compute outputs last_layer = tf.reshape(lstm_out, [-1, cell_size]) - logits = linear( - last_layer, num_outputs, "action", normc_initializer(0.01)) + logits = linear(last_layer, num_outputs, "action", + normc_initializer(0.01)) return logits, last_layer diff --git a/python/ray/rllib/models/misc.py b/python/ray/rllib/models/misc.py index 461296ecd..aad399c3b 100644 --- a/python/ray/rllib/models/misc.py +++ b/python/ray/rllib/models/misc.py @@ -11,6 +11,7 @@ def normc_initializer(std=1.0): out = np.random.randn(*shape).astype(np.float32) out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) return tf.constant(out) + return _initializer @@ -18,12 +19,20 @@ def get_activation_fn(name): return getattr(tf.nn, name) -def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", - dtype=tf.float32, collections=None): +def conv2d(x, + num_filters, + name, + filter_size=(3, 3), + stride=(1, 1), + pad="SAME", + dtype=tf.float32, + collections=None): with tf.variable_scope(name): stride_shape = [1, stride[0], stride[1], 1] - filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), - num_filters] + filter_shape = [ + filter_size[0], filter_size[1], + int(x.get_shape()[3]), num_filters + ] # There are "num input feature maps * filter height * filter width" # inputs to each hidden unit. @@ -34,20 +43,24 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", # Initialize weights with random weights. w_bound = np.sqrt(6 / (fan_in + fan_out)) - w = tf.get_variable("W", filter_shape, dtype, - tf.random_uniform_initializer(-w_bound, w_bound), - collections=collections) - b = tf.get_variable("b", [1, 1, 1, num_filters], - initializer=tf.constant_initializer(0.0), - collections=collections) + w = tf.get_variable( + "W", + filter_shape, + dtype, + tf.random_uniform_initializer(-w_bound, w_bound), + collections=collections) + b = tf.get_variable( + "b", [1, 1, 1, num_filters], + initializer=tf.constant_initializer(0.0), + collections=collections) return tf.nn.conv2d(x, w, stride_shape, pad) + b def linear(x, size, name, initializer=None, bias_init=0): - w = tf.get_variable(name + "/w", [x.get_shape()[1], size], - initializer=initializer) - b = tf.get_variable(name + "/b", [size], - initializer=tf.constant_initializer(bias_init)) + w = tf.get_variable( + name + "/w", [x.get_shape()[1], size], initializer=initializer) + b = tf.get_variable( + name + "/b", [size], initializer=tf.constant_initializer(bias_init)) return tf.matmul(x, w) + b diff --git a/python/ray/rllib/models/model.py b/python/ray/rllib/models/model.py index 27206adaf..00d6575e6 100644 --- a/python/ray/rllib/models/model.py +++ b/python/ray/rllib/models/model.py @@ -37,8 +37,12 @@ class Model(object): a scale parameter (like a standard deviation). """ - def __init__( - self, inputs, num_outputs, options, state_in=None, seq_lens=None): + def __init__(self, + inputs, + num_outputs, + options, + state_in=None, + seq_lens=None): self.inputs = inputs # Default attribute values for the non-RNN case @@ -57,8 +61,10 @@ class Model(object): self.outputs, self.last_layer = self._build_layers( inputs, num_outputs, options) if options.get("free_log_std", False): - log_std = tf.get_variable(name="log_std", shape=[num_outputs], - initializer=tf.zeros_initializer) + log_std = tf.get_variable( + name="log_std", + shape=[num_outputs], + initializer=tf.zeros_initializer) self.outputs = tf.concat( [self.outputs, 0.0 * self.outputs + log_std], 1) diff --git a/python/ray/rllib/models/multiagentfcnet.py b/python/ray/rllib/models/multiagentfcnet.py index 81d9c8d15..d000e95df 100644 --- a/python/ray/rllib/models/multiagentfcnet.py +++ b/python/ray/rllib/models/multiagentfcnet.py @@ -23,7 +23,7 @@ class MultiAgentFullyConnectedNetwork(Model): custom_options = options["custom_options"] hiddens = custom_options.get("multiagent_fcnet_hiddens", - [[256, 256]]*1) + [[256, 256]] * 1) # check for a shared model shared_model = custom_options.get("multiagent_shared_model", 0) @@ -35,8 +35,8 @@ class MultiAgentFullyConnectedNetwork(Model): sub_options = options.copy() sub_options.update({"fcnet_hiddens": hiddens[i]}) # TODO(ev) make this support arbitrary networks - fcnet = FullyConnectedNetwork( - split_inputs[i], int(num_actions[i]), sub_options) + fcnet = FullyConnectedNetwork(split_inputs[i], + int(num_actions[i]), sub_options) output = fcnet.outputs outputs.append(output) overall_output = tf.concat(outputs, axis=1) diff --git a/python/ray/rllib/models/preprocessors.py b/python/ray/rllib/models/preprocessors.py index 050d8b5a2..46404ae08 100644 --- a/python/ray/rllib/models/preprocessors.py +++ b/python/ray/rllib/models/preprocessors.py @@ -6,7 +6,7 @@ import numpy as np import gym ATARI_OBS_SHAPE = (210, 160, 3) -ATARI_RAM_OBS_SHAPE = (128,) +ATARI_RAM_OBS_SHAPE = (128, ) class Preprocessor(object): @@ -70,7 +70,7 @@ class AtariPixelPreprocessor(Preprocessor): class AtariRamPreprocessor(Preprocessor): def _init(self): - self.shape = (128,) + self.shape = (128, ) def transform(self, observation): return (observation - 128) / 128 @@ -78,7 +78,7 @@ class AtariRamPreprocessor(Preprocessor): class OneHotPreprocessor(Preprocessor): def _init(self): - self.shape = (self._obs_space.n,) + self.shape = (self._obs_space.n, ) def transform(self, observation): arr = np.zeros(self._obs_space.n) @@ -111,13 +111,14 @@ class TupleFlatteningPreprocessor(Preprocessor): preprocessor = get_preprocessor(space)(space, self._options) self.preprocessors.append(preprocessor) size += np.product(preprocessor.shape) - self.shape = (size,) + self.shape = (size, ) def transform(self, observation): assert len(observation) == len(self.preprocessors), observation return np.concatenate([ np.reshape(p.transform(o), [np.product(p.shape)]) - for (o, p) in zip(observation, self.preprocessors)]) + for (o, p) in zip(observation, self.preprocessors) + ]) def get_preprocessor(space): diff --git a/python/ray/rllib/models/visionnet.py b/python/ray/rllib/models/visionnet.py index 893f7acd2..c3b633dbe 100644 --- a/python/ray/rllib/models/visionnet.py +++ b/python/ray/rllib/models/visionnet.py @@ -22,14 +22,27 @@ class VisionNetwork(Model): with tf.name_scope("vision_net"): for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1): inputs = slim.conv2d( - inputs, out_size, kernel, stride, - activation_fn=activation, scope="conv{}".format(i)) + inputs, + out_size, + kernel, + stride, + activation_fn=activation, + scope="conv{}".format(i)) out_size, kernel, stride = filters[-1] fc1 = slim.conv2d( - inputs, out_size, kernel, stride, - activation_fn=activation, padding="VALID", scope="fc1") - fc2 = slim.conv2d(fc1, num_outputs, [1, 1], activation_fn=None, - normalizer_fn=None, scope="fc2") + inputs, + out_size, + kernel, + stride, + activation_fn=activation, + padding="VALID", + scope="fc1") + fc2 = slim.conv2d( + fc1, + num_outputs, [1, 1], + activation_fn=None, + normalizer_fn=None, + scope="fc2") return flatten(fc2), flatten(fc1) diff --git a/python/ray/rllib/optimizers/__init__.py b/python/ray/rllib/optimizers/__init__.py index f8b530ff7..eadb38620 100644 --- a/python/ray/rllib/optimizers/__init__.py +++ b/python/ray/rllib/optimizers/__init__.py @@ -6,7 +6,6 @@ from ray.rllib.optimizers.sync_samples_optimizer import SyncSamplesOptimizer from ray.rllib.optimizers.sync_replay_optimizer import SyncReplayOptimizer from ray.rllib.optimizers.multi_gpu_optimizer import LocalMultiGPUOptimizer - __all__ = [ "PolicyOptimizer", "AsyncSamplesOptimizer", "AsyncGradientsOptimizer", "SyncSamplesOptimizer", "SyncReplayOptimizer", "LocalMultiGPUOptimizer" diff --git a/python/ray/rllib/optimizers/async_gradients_optimizer.py b/python/ray/rllib/optimizers/async_gradients_optimizer.py index 3c379782f..397fabba9 100644 --- a/python/ray/rllib/optimizers/async_gradients_optimizer.py +++ b/python/ray/rllib/optimizers/async_gradients_optimizer.py @@ -14,6 +14,7 @@ class AsyncGradientsOptimizer(PolicyOptimizer): evaluators, sending updated weights back as needed. This pipelines the gradient computations on the remote workers. """ + def _init(self, grads_per_step=100): self.apply_timer = TimerStat() self.wait_timer = TimerStat() @@ -55,8 +56,9 @@ class AsyncGradientsOptimizer(PolicyOptimizer): num_gradients += 1 def stats(self): - return dict(PolicyOptimizer.stats(self), **{ - "wait_time_ms": round(1000 * self.wait_timer.mean, 3), - "apply_time_ms": round(1000 * self.apply_timer.mean, 3), - "dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3), - }) + return dict( + PolicyOptimizer.stats(self), **{ + "wait_time_ms": round(1000 * self.wait_timer.mean, 3), + "apply_time_ms": round(1000 * self.apply_timer.mean, 3), + "dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3), + }) diff --git a/python/ray/rllib/optimizers/async_samples_optimizer.py b/python/ray/rllib/optimizers/async_samples_optimizer.py index dfc52e1d8..e37901c46 100644 --- a/python/ray/rllib/optimizers/async_samples_optimizer.py +++ b/python/ray/rllib/optimizers/async_samples_optimizer.py @@ -22,7 +22,6 @@ from ray.rllib.utils.actors import TaskPool, create_colocated from ray.rllib.utils.timer import TimerStat from ray.rllib.utils.window_stat import WindowStat - SAMPLE_QUEUE_DEPTH = 2 REPLAY_QUEUE_DEPTH = 4 LEARNER_QUEUE_MAX_SIZE = 16 @@ -35,10 +34,10 @@ class ReplayActor(object): Ray actors are single-threaded, so for scalability multiple replay actors may be created to increase parallelism.""" - def __init__( - self, num_shards, learning_starts, buffer_size, train_batch_size, - prioritized_replay_alpha, prioritized_replay_beta, - prioritized_replay_eps, clip_rewards): + def __init__(self, num_shards, learning_starts, buffer_size, + train_batch_size, prioritized_replay_alpha, + prioritized_replay_beta, prioritized_replay_eps, + clip_rewards): self.replay_starts = learning_starts // num_shards self.buffer_size = buffer_size // num_shards self.train_batch_size = train_batch_size @@ -46,7 +45,8 @@ class ReplayActor(object): self.prioritized_replay_eps = prioritized_replay_eps self.replay_buffer = PrioritizedReplayBuffer( - self.buffer_size, alpha=prioritized_replay_alpha, + self.buffer_size, + alpha=prioritized_replay_alpha, clip_rewards=clip_rewards) # Metrics @@ -60,38 +60,39 @@ class ReplayActor(object): def add_batch(self, batch): with self.add_batch_timer: for row in batch.rows(): - self.replay_buffer.add( - row["obs"], row["actions"], row["rewards"], row["new_obs"], - row["dones"], row["weights"]) + self.replay_buffer.add(row["obs"], row["actions"], + row["rewards"], row["new_obs"], + row["dones"], row["weights"]) def replay(self): with self.replay_timer: if len(self.replay_buffer) < self.replay_starts: return None - (obses_t, actions, rewards, obses_tp1, - dones, weights, batch_indexes) = self.replay_buffer.sample( - self.train_batch_size, - beta=self.prioritized_replay_beta) + (obses_t, actions, rewards, obses_tp1, dones, weights, + batch_indexes) = self.replay_buffer.sample( + self.train_batch_size, beta=self.prioritized_replay_beta) batch = SampleBatch({ - "obs": obses_t, "actions": actions, "rewards": rewards, - "new_obs": obses_tp1, "dones": dones, "weights": weights, - "batch_indexes": batch_indexes}) + "obs": obses_t, + "actions": actions, + "rewards": rewards, + "new_obs": obses_tp1, + "dones": dones, + "weights": weights, + "batch_indexes": batch_indexes + }) return batch def update_priorities(self, batch_indexes, td_errors): with self.update_priorities_timer: - new_priorities = ( - np.abs(td_errors) + self.prioritized_replay_eps) + new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps) self.replay_buffer.update_priorities(batch_indexes, new_priorities) def stats(self): stat = { - "add_batch_time_ms": round( - 1000 * self.add_batch_timer.mean, 3), - "replay_time_ms": round( - 1000 * self.replay_timer.mean, 3), + "add_batch_time_ms": round(1000 * self.add_batch_timer.mean, 3), + "replay_time_ms": round(1000 * self.replay_timer.mean, 3), "update_priorities_time_ms": round( 1000 * self.update_priorities_timer.mean, 3), } @@ -145,13 +146,19 @@ class AsyncSamplesOptimizer(PolicyOptimizer): "td_error" array in the info return of compute_gradients(). This error term will be used for sample prioritization.""" - def _init( - self, learning_starts=1000, buffer_size=10000, - prioritized_replay=True, prioritized_replay_alpha=0.6, - prioritized_replay_beta=0.4, prioritized_replay_eps=1e-6, - train_batch_size=512, sample_batch_size=50, - num_replay_buffer_shards=1, max_weight_sync_delay=400, - clip_rewards=True, debug=False): + def _init(self, + learning_starts=1000, + buffer_size=10000, + prioritized_replay=True, + prioritized_replay_alpha=0.6, + prioritized_replay_beta=0.4, + prioritized_replay_eps=1e-6, + train_batch_size=512, + sample_batch_size=50, + num_replay_buffer_shards=1, + max_weight_sync_delay=400, + clip_rewards=True, + debug=False): self.debug = debug self.replay_starts = learning_starts @@ -164,18 +171,21 @@ class AsyncSamplesOptimizer(PolicyOptimizer): self.learner = LearnerThread(self.local_evaluator) self.learner.start() - self.replay_actors = create_colocated( - ReplayActor, - [num_replay_buffer_shards, learning_starts, buffer_size, - train_batch_size, prioritized_replay_alpha, - prioritized_replay_beta, prioritized_replay_eps, clip_rewards], - num_replay_buffer_shards) + self.replay_actors = create_colocated(ReplayActor, [ + num_replay_buffer_shards, learning_starts, buffer_size, + train_batch_size, prioritized_replay_alpha, + prioritized_replay_beta, prioritized_replay_eps, clip_rewards + ], num_replay_buffer_shards) assert len(self.remote_evaluators) > 0 # Stats - self.timers = {k: TimerStat() for k in [ - "put_weights", "get_samples", "enqueue", "sample_processing", - "replay_processing", "update_priorities", "train", "sample"]} + self.timers = { + k: TimerStat() + for k in [ + "put_weights", "get_samples", "enqueue", "sample_processing", + "replay_processing", "update_priorities", "train", "sample" + ] + } self.num_weight_syncs = 0 self.learning_started = False @@ -221,8 +231,8 @@ class AsyncSamplesOptimizer(PolicyOptimizer): sample_timesteps += self.sample_batch_size # Send the data to the replay buffer - random.choice(self.replay_actors).add_batch.remote( - sample_batch) + random.choice( + self.replay_actors).add_batch.remote(sample_batch) # Update weights if needed self.steps_since_update[ev] += self.sample_batch_size @@ -268,8 +278,8 @@ class AsyncSamplesOptimizer(PolicyOptimizer): timing["learner_dequeue_time_ms"] = round( 1000 * self.learner.queue_timer.mean, 3) stats = { - "sample_throughput": round( - self.timers["sample"].mean_throughput, 3), + "sample_throughput": round(self.timers["sample"].mean_throughput, + 3), "train_throughput": round(self.timers["train"].mean_throughput, 3), "num_weight_syncs": self.num_weight_syncs, } diff --git a/python/ray/rllib/optimizers/multi_gpu_impl.py b/python/ray/rllib/optimizers/multi_gpu_impl.py index 844dc11fb..7233e37e9 100644 --- a/python/ray/rllib/optimizers/multi_gpu_impl.py +++ b/python/ray/rllib/optimizers/multi_gpu_impl.py @@ -6,7 +6,6 @@ from collections import namedtuple import tensorflow as tf - # Variable scope in which created variables will be placed under TOWER_SCOPE_NAME = "tower" @@ -47,8 +46,14 @@ class LocalSyncParallelOptimizer(object): grad_norm_clipping: None or int stdev to clip grad norms by """ - def __init__(self, optimizer, devices, input_placeholders, rnn_inputs, - per_device_batch_size, build_graph, logdir, + def __init__(self, + optimizer, + devices, + input_placeholders, + rnn_inputs, + per_device_batch_size, + build_graph, + logdir, grad_norm_clipping=None): # TODO(rliaw): remove logdir self.optimizer = optimizer @@ -78,8 +83,8 @@ class LocalSyncParallelOptimizer(object): self._towers = [] for device, device_placeholders in zip(self.devices, data_splits): self._towers.append( - self._setup_device( - device, device_placeholders, len(input_placeholders))) + self._setup_device(device, device_placeholders, + len(input_placeholders))) avg = average_gradients([t.grads for t in self._towers]) if grad_norm_clipping: @@ -119,14 +124,10 @@ class LocalSyncParallelOptimizer(object): assert len(state_inputs[0]) * seq_len == len(inputs[0]) # Make sure the shorter state inputs arrays are evenly divisible state_inputs = [ - make_divisible_by(arr, self.batch_size) - for arr in state_inputs + make_divisible_by(arr, self.batch_size) for arr in state_inputs ] # Then truncate the data inputs to match - inputs = [ - arr[:len(state_inputs[0]) * seq_len] - for arr in inputs - ] + inputs = [arr[:len(state_inputs[0]) * seq_len] for arr in inputs] assert len(state_inputs[0]) * seq_len == len(inputs[0]) assert len(state_inputs[0]) % self.batch_size == 0 for ph, arr in zip(self.loss_inputs, inputs + state_inputs): @@ -138,8 +139,7 @@ class LocalSyncParallelOptimizer(object): feed_dict[ph] = truncated_arr truncated_len = len(truncated_arr) - sess.run( - [t.init_op for t in self._towers], feed_dict=feed_dict) + sess.run([t.init_op for t in self._towers], feed_dict=feed_dict) tuples_per_device = truncated_len / len(self.devices) assert tuples_per_device > 0, \ @@ -198,7 +198,9 @@ class LocalSyncParallelOptimizer(object): device_input_slices = [] for i, ph in enumerate(device_input_placeholders): current_batch = tf.Variable( - ph, trainable=False, validate_shape=False, + ph, + trainable=False, + validate_shape=False, collections=[]) device_input_batches.append(current_batch) if i < num_data_in: @@ -210,18 +212,17 @@ class LocalSyncParallelOptimizer(object): current_slice = tf.slice( current_batch, ([self._batch_index // scale * granularity] + - [0] * len(ph.shape[1:])), + [0] * len(ph.shape[1:])), ([self.per_device_batch_size // scale * granularity] + - [-1] * len(ph.shape[1:]))) + [-1] * len(ph.shape[1:]))) current_slice.set_shape(ph.shape) device_input_slices.append(current_slice) graph_obj = self.build_graph(device_input_slices) device_grads = graph_obj.gradients(self.optimizer) return Tower( - tf.group(*[batch.initializer - for batch in device_input_batches]), - device_grads, - graph_obj) + tf.group( + *[batch.initializer for batch in device_input_batches]), + device_grads, graph_obj) # Each tower is a copy of the loss graph pinned to a specific device. diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index 0c39aab7a..7e4ee2895 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -30,8 +30,12 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): may result in unexpected behavior. """ - def _init(self, sgd_batch_size=128, sgd_stepsize=5e-5, num_sgd_iter=10, - timesteps_per_batch=1024, standardize_fields=[]): + def _init(self, + sgd_batch_size=128, + sgd_stepsize=5e-5, + num_sgd_iter=10, + timesteps_per_batch=1024, + standardize_fields=[]): self.batch_size = sgd_batch_size self.sgd_stepsize = sgd_stepsize self.num_sgd_iter = num_sgd_iter @@ -41,8 +45,8 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): self.devices = ["/cpu:0"] else: self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))] - self.batch_size = int( - sgd_batch_size / len(self.devices)) * len(self.devices) + self.batch_size = int(sgd_batch_size / len(self.devices)) * len( + self.devices) assert self.batch_size % len(self.devices) == 0 assert self.batch_size >= len(self.devices), "batch size too small" self.per_device_batch_size = int(self.batch_size / len(self.devices)) @@ -70,16 +74,15 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): with tf.variable_scope("default", reuse=tf.AUTO_REUSE): if self.policy._state_inputs: rnn_inputs = self.policy._state_inputs + [ - self.policy._seq_lens] + self.policy._seq_lens + ] else: rnn_inputs = [] self.par_opt = LocalSyncParallelOptimizer( - tf.train.AdamOptimizer(self.sgd_stepsize), - self.devices, - [v for _, v in self.policy.loss_inputs()], - rnn_inputs, - self.per_device_batch_size, - self.policy.copy, + tf.train.AdamOptimizer( + self.sgd_stepsize), self.devices, + [v for _, v in self.policy.loss_inputs()], rnn_inputs, + self.per_device_batch_size, self.policy.copy, os.getcwd()) self.sess = self.local_evaluator.tf_sess @@ -117,8 +120,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): else: state_keys = [] tuples_per_device = self.par_opt.load_data( - self.sess, - [tuples[k] for k in data_keys], + self.sess, [tuples[k] for k in data_keys], [tuples[k] for k in state_keys]) with self.grad_timer: @@ -141,12 +143,14 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): return _averaged(iter_extra_fetches) def stats(self): - return dict(PolicyOptimizer.stats(self), **{ - "sample_time_ms": round(1000 * self.sample_timer.mean, 3), - "load_time_ms": round(1000 * self.load_timer.mean, 3), - "grad_time_ms": round(1000 * self.grad_timer.mean, 3), - "update_time_ms": round(1000 * self.update_weights_timer.mean, 3), - }) + return dict( + PolicyOptimizer.stats(self), **{ + "sample_time_ms": round(1000 * self.sample_timer.mean, 3), + "load_time_ms": round(1000 * self.load_timer.mean, 3), + "grad_time_ms": round(1000 * self.grad_timer.mean, 3), + "update_time_ms": round(1000 * self.update_weights_timer.mean, + 3), + }) def _averaged(kv): diff --git a/python/ray/rllib/optimizers/policy_optimizer.py b/python/ray/rllib/optimizers/policy_optimizer.py index e4f8ce011..04a76f4ea 100644 --- a/python/ray/rllib/optimizers/policy_optimizer.py +++ b/python/ray/rllib/optimizers/policy_optimizer.py @@ -103,9 +103,10 @@ class PolicyOptimizer(object): """ local_result = [func(self.local_evaluator, 0)] - remote_results = ray.get( - [ev.apply.remote(func, i + 1) - for i, ev in enumerate(self.remote_evaluators)]) + remote_results = ray.get([ + ev.apply.remote(func, i + 1) + for i, ev in enumerate(self.remote_evaluators) + ]) return local_result + remote_results def collect_metrics(self): diff --git a/python/ray/rllib/optimizers/replay_buffer.py b/python/ray/rllib/optimizers/replay_buffer.py index a1e374414..6730a62b2 100644 --- a/python/ray/rllib/optimizers/replay_buffer.py +++ b/python/ray/rllib/optimizers/replay_buffer.py @@ -90,8 +90,10 @@ class ReplayBuffer(object): done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode and 0 otherwise. """ - idxes = [random.randint(0, len(self._storage) - 1) - for _ in range(batch_size)] + idxes = [ + random.randint(0, + len(self._storage) - 1) for _ in range(batch_size) + ] self._num_sampled += batch_size return self._encode_sample(idxes) @@ -142,12 +144,12 @@ class PrioritizedReplayBuffer(ReplayBuffer): reward = np.sign(reward) idx = self._next_idx - super(PrioritizedReplayBuffer, self).add( - obs_t, action, reward, obs_tp1, done, weight) + super(PrioritizedReplayBuffer, self).add(obs_t, action, reward, + obs_tp1, done, weight) if weight is None: weight = self._max_priority - self._it_sum[idx] = weight ** self._alpha - self._it_min[idx] = weight ** self._alpha + self._it_sum[idx] = weight**self._alpha + self._it_min[idx] = weight**self._alpha def _sample_proportional(self, batch_size): res = [] @@ -202,11 +204,11 @@ class PrioritizedReplayBuffer(ReplayBuffer): weights = [] p_min = self._it_min.min() / self._it_sum.sum() - max_weight = (p_min * len(self._storage)) ** (-beta) + max_weight = (p_min * len(self._storage))**(-beta) for idx in idxes: p_sample = self._it_sum[idx] / self._it_sum.sum() - weight = (p_sample * len(self._storage)) ** (-beta) + weight = (p_sample * len(self._storage))**(-beta) weights.append(weight / max_weight) weights = np.array(weights) encoded_sample = self._encode_sample(idxes) @@ -231,10 +233,10 @@ class PrioritizedReplayBuffer(ReplayBuffer): for idx, priority in zip(idxes, priorities): assert priority > 0 assert 0 <= idx < len(self._storage) - delta = priority ** self._alpha - self._it_sum[idx] + delta = priority**self._alpha - self._it_sum[idx] self._prio_change_stats.push(delta) - self._it_sum[idx] = priority ** self._alpha - self._it_min[idx] = priority ** self._alpha + self._it_sum[idx] = priority**self._alpha + self._it_min[idx] = priority**self._alpha self._max_priority = max(self._max_priority, priority) diff --git a/python/ray/rllib/optimizers/segment_tree.py b/python/ray/rllib/optimizers/segment_tree.py index b412a89bd..e09ed4723 100644 --- a/python/ray/rllib/optimizers/segment_tree.py +++ b/python/ray/rllib/optimizers/segment_tree.py @@ -54,8 +54,7 @@ class SegmentTree(object): return self._operation( self._reduce_helper(start, mid, 2 * node, node_start, mid), self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, - node_end) - ) + node_end)) def reduce(self, start=0, end=None): """Returns result of applying `self.operation` @@ -89,9 +88,8 @@ class SegmentTree(object): self._value[idx] = val idx //= 2 while idx >= 1: - self._value[idx] = self._operation( - self._value[2 * idx], - self._value[2 * idx + 1]) + self._value[idx] = self._operation(self._value[2 * idx], + self._value[2 * idx + 1]) idx //= 2 def __getitem__(self, idx): @@ -102,9 +100,7 @@ class SegmentTree(object): class SumSegmentTree(SegmentTree): def __init__(self, capacity): super(SumSegmentTree, self).__init__( - capacity=capacity, - operation=operator.add, - neutral_element=0.0) + capacity=capacity, operation=operator.add, neutral_element=0.0) def sum(self, start=0, end=None): """Returns arr[start] + ... + arr[end]""" @@ -142,9 +138,7 @@ class SumSegmentTree(SegmentTree): class MinSegmentTree(SegmentTree): def __init__(self, capacity): super(MinSegmentTree, self).__init__( - capacity=capacity, - operation=min, - neutral_element=float('inf')) + capacity=capacity, operation=min, neutral_element=float('inf')) def min(self, start=0, end=None): """Returns min(arr[start], ..., arr[end])""" diff --git a/python/ray/rllib/optimizers/sync_replay_optimizer.py b/python/ray/rllib/optimizers/sync_replay_optimizer.py index 1058b0d5a..834994cd7 100644 --- a/python/ray/rllib/optimizers/sync_replay_optimizer.py +++ b/python/ray/rllib/optimizers/sync_replay_optimizer.py @@ -23,11 +23,16 @@ class SyncReplayOptimizer(PolicyOptimizer): "td_error" array in the info return of compute_gradients(). This error term will be used for sample prioritization.""" - def _init( - self, learning_starts=1000, buffer_size=10000, - prioritized_replay=True, prioritized_replay_alpha=0.6, - prioritized_replay_beta=0.4, prioritized_replay_eps=1e-6, - train_batch_size=32, sample_batch_size=4, clip_rewards=True): + def _init(self, + learning_starts=1000, + buffer_size=10000, + prioritized_replay=True, + prioritized_replay_alpha=0.6, + prioritized_replay_beta=0.4, + prioritized_replay_eps=1e-6, + train_batch_size=32, + sample_batch_size=4, + clip_rewards=True): self.replay_starts = learning_starts self.prioritized_replay_beta = prioritized_replay_beta @@ -43,11 +48,14 @@ class SyncReplayOptimizer(PolicyOptimizer): # Set up replay buffer if prioritized_replay: + def new_buffer(): return PrioritizedReplayBuffer( - buffer_size, alpha=prioritized_replay_alpha, + buffer_size, + alpha=prioritized_replay_alpha, clip_rewards=clip_rewards) else: + def new_buffer(): return ReplayBuffer(buffer_size, clip_rewards) @@ -72,17 +80,19 @@ class SyncReplayOptimizer(PolicyOptimizer): # Handle everything as if multiagent if isinstance(batch, SampleBatch): - batch = MultiAgentBatch( - {DEFAULT_POLICY_ID: batch}, batch.count) + batch = MultiAgentBatch({ + DEFAULT_POLICY_ID: batch + }, batch.count) for policy_id, s in batch.policy_batches.items(): for row in s.rows(): if "weights" not in row: row["weights"] = np.ones_like(row["rewards"]) self.replay_buffers[policy_id].add( - pack_if_needed(row["obs"]), row["actions"], - row["rewards"], pack_if_needed(row["new_obs"]), - row["dones"], row["weights"]) + pack_if_needed(row["obs"]), + row["actions"], row["rewards"], + pack_if_needed(row["new_obs"]), row["dones"], + row["weights"]) if self.num_steps_sampled >= self.replay_starts: self._optimize() @@ -112,27 +122,35 @@ class SyncReplayOptimizer(PolicyOptimizer): with self.replay_timer: for policy_id, replay_buffer in self.replay_buffers.items(): if isinstance(replay_buffer, PrioritizedReplayBuffer): - (obses_t, actions, rewards, obses_tp1, - dones, weights, batch_indexes) = replay_buffer.sample( - self.train_batch_size, - beta=self.prioritized_replay_beta) + (obses_t, actions, rewards, obses_tp1, dones, weights, + batch_indexes) = replay_buffer.sample( + self.train_batch_size, + beta=self.prioritized_replay_beta) else: (obses_t, actions, rewards, obses_tp1, - dones) = replay_buffer.sample(self.train_batch_size) + dones) = replay_buffer.sample(self.train_batch_size) weights = np.ones_like(rewards) - batch_indexes = - np.ones_like(rewards) + batch_indexes = -np.ones_like(rewards) samples[policy_id] = SampleBatch({ - "obs": obses_t, "actions": actions, "rewards": rewards, - "new_obs": obses_tp1, "dones": dones, "weights": weights, - "batch_indexes": batch_indexes}) + "obs": obses_t, + "actions": actions, + "rewards": rewards, + "new_obs": obses_tp1, + "dones": dones, + "weights": weights, + "batch_indexes": batch_indexes + }) return MultiAgentBatch(samples, self.train_batch_size) def stats(self): - return dict(PolicyOptimizer.stats(self), **{ - "sample_time_ms": round(1000 * self.sample_timer.mean, 3), - "replay_time_ms": round(1000 * self.replay_timer.mean, 3), - "grad_time_ms": round(1000 * self.grad_timer.mean, 3), - "update_time_ms": round(1000 * self.update_weights_timer.mean, 3), - "opt_peak_throughput": round(self.grad_timer.mean_throughput, 3), - "opt_samples": round(self.grad_timer.mean_units_processed, 3), - }) + return dict( + PolicyOptimizer.stats(self), **{ + "sample_time_ms": round(1000 * self.sample_timer.mean, 3), + "replay_time_ms": round(1000 * self.replay_timer.mean, 3), + "grad_time_ms": round(1000 * self.grad_timer.mean, 3), + "update_time_ms": round(1000 * self.update_weights_timer.mean, + 3), + "opt_peak_throughput": round(self.grad_timer.mean_throughput, + 3), + "opt_samples": round(self.grad_timer.mean_units_processed, 3), + }) diff --git a/python/ray/rllib/optimizers/sync_samples_optimizer.py b/python/ray/rllib/optimizers/sync_samples_optimizer.py index 6b4483fb1..76d2d9c46 100644 --- a/python/ray/rllib/optimizers/sync_samples_optimizer.py +++ b/python/ray/rllib/optimizers/sync_samples_optimizer.py @@ -51,10 +51,13 @@ class SyncSamplesOptimizer(PolicyOptimizer): return fetches def stats(self): - return dict(PolicyOptimizer.stats(self), **{ - "sample_time_ms": round(1000 * self.sample_timer.mean, 3), - "grad_time_ms": round(1000 * self.grad_timer.mean, 3), - "update_time_ms": round(1000 * self.update_weights_timer.mean, 3), - "opt_peak_throughput": round(self.grad_timer.mean_throughput, 3), - "opt_samples": round(self.grad_timer.mean_units_processed, 3), - }) + return dict( + PolicyOptimizer.stats(self), **{ + "sample_time_ms": round(1000 * self.sample_timer.mean, 3), + "grad_time_ms": round(1000 * self.grad_timer.mean, 3), + "update_time_ms": round(1000 * self.update_weights_timer.mean, + 3), + "opt_peak_throughput": round(self.grad_timer.mean_throughput, + 3), + "opt_samples": round(self.grad_timer.mean_units_processed, 3), + }) diff --git a/python/ray/rllib/rollout.py b/python/ray/rllib/rollout.py index 58639c361..0e33e3d6c 100755 --- a/python/ray/rllib/rollout.py +++ b/python/ray/rllib/rollout.py @@ -15,7 +15,6 @@ from ray.rllib.agents.agent import get_agent_class from ray.rllib.agents.dqn.common.wrappers import wrap_dqn from ray.rllib.models import ModelCatalog - EXAMPLE_USAGE = """ Example Usage via RLlib CLI: rllib rollout /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN @@ -32,30 +31,37 @@ def create_parser(parser_creator=None): parser = parser_creator( formatter_class=argparse.RawDescriptionHelpFormatter, description="Roll out a reinforcement learning agent " - "given a checkpoint.", epilog=EXAMPLE_USAGE) + "given a checkpoint.", + epilog=EXAMPLE_USAGE) parser.add_argument( "checkpoint", type=str, help="Checkpoint from which to roll out.") required_named = parser.add_argument_group("required named arguments") required_named.add_argument( - "--run", type=str, required=True, + "--run", + type=str, + required=True, help="The algorithm or model to train. This may refer to the name " - "of a built-on algorithm (e.g. RLLib's DQN or PPO), or a " - "user-defined trainable function or class registered in the " - "tune registry.") + "of a built-on algorithm (e.g. RLLib's DQN or PPO), or a " + "user-defined trainable function or class registered in the " + "tune registry.") required_named.add_argument( "--env", type=str, help="The gym environment to use.") parser.add_argument( - "--no-render", default=False, action="store_const", const=True, + "--no-render", + default=False, + action="store_const", + const=True, help="Surpress rendering of the environment.") parser.add_argument( "--steps", default=None, help="Number of steps to roll out.") + parser.add_argument("--out", default=None, help="Output filename.") parser.add_argument( - "--out", default=None, help="Output filename.") - parser.add_argument( - "--config", default="{}", type=json.loads, + "--config", + default="{}", + type=json.loads, help="Algorithm-specific configuration (e.g. env, hyperparams). " - "Surpresses loading of configuration from checkpoint.") + "Surpresses loading of configuration from checkpoint.") return parser diff --git a/python/ray/rllib/scripts.py b/python/ray/rllib/scripts.py index ede37efc5..cc48b83cf 100644 --- a/python/ray/rllib/scripts.py +++ b/python/ray/rllib/scripts.py @@ -9,7 +9,6 @@ import argparse from ray.rllib import train from ray.rllib import rollout - EXAMPLE_USAGE = """ Example usage for training: rllib train --run DQN --env CartPole-v0 diff --git a/python/ray/rllib/test/mock_evaluator.py b/python/ray/rllib/test/mock_evaluator.py index 83c0f354e..e11b097e7 100644 --- a/python/ray/rllib/test/mock_evaluator.py +++ b/python/ray/rllib/test/mock_evaluator.py @@ -15,16 +15,17 @@ class _MockEvaluator(object): self._sample_count = sample_count self.obs_filter = MeanStdFilter(()) self.rew_filter = MeanStdFilter(()) - self.filters = {"obs_filter": self.obs_filter, - "rew_filter": self.rew_filter} + self.filters = { + "obs_filter": self.obs_filter, + "rew_filter": self.rew_filter + } def sample(self): samples_dict = {"observations": [], "rewards": []} for i in range(self._sample_count): samples_dict["observations"].append( self.obs_filter(np.random.randn())) - samples_dict["rewards"].append( - self.rew_filter(np.random.randn())) + samples_dict["rewards"].append(self.rew_filter(np.random.randn())) return SampleBatch(samples_dict) def compute_gradients(self, samples): diff --git a/python/ray/rllib/test/test_catalog.py b/python/ray/rllib/test/test_catalog.py index 3e8a08990..454c9255c 100644 --- a/python/ray/rllib/test/test_catalog.py +++ b/python/ray/rllib/test/test_catalog.py @@ -8,8 +8,8 @@ import ray from ray.rllib.models import ModelCatalog from ray.rllib.models.model import Model -from ray.rllib.models.preprocessors import ( - NoPreprocessor, OneHotPreprocessor, Preprocessor) +from ray.rllib.models.preprocessors import (NoPreprocessor, OneHotPreprocessor, + Preprocessor) from ray.rllib.models.fcnet import FullyConnectedNetwork from ray.rllib.models.visionnet import VisionNetwork @@ -44,9 +44,11 @@ class ModelCatalogTest(unittest.TestCase): class TupleEnv(object): def __init__(self): self.observation_space = Tuple( - [Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)]) + [Discrete(5), + Box(0, 1, shape=(3, ), dtype=np.float32)]) + p1 = ModelCatalog.get_preprocessor(TupleEnv()) - self.assertEqual(p1.shape, (8,)) + self.assertEqual(p1.shape, (8, )) self.assertEqual( list(p1.transform((0, [1, 2, 3]))), [float(x) for x in [1, 0, 0, 0, 0, 1, 2, 3]]) diff --git a/python/ray/rllib/test/test_checkpoint_restore.py b/python/ray/rllib/test/test_checkpoint_restore.py index f94e08b5a..1776ee8a1 100644 --- a/python/ray/rllib/test/test_checkpoint_restore.py +++ b/python/ray/rllib/test/test_checkpoint_restore.py @@ -20,12 +20,24 @@ def get_mean_action(alg, obs): ray.init(num_cpus=10) CONFIGS = { - "ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100, - "num_workers": 2}, + "ES": { + "episodes_per_batch": 10, + "timesteps_per_batch": 100, + "num_workers": 2 + }, "DQN": {}, - "DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100}, - "PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2}, - "A3C": {"num_workers": 1}, + "DDPG": { + "noise_scale": 0.0, + "timesteps_per_iteration": 100 + }, + "PPO": { + "num_sgd_iter": 5, + "timesteps_per_batch": 1000, + "num_workers": 2 + }, + "A3C": { + "num_workers": 1 + }, } diff --git a/python/ray/rllib/test/test_filters.py b/python/ray/rllib/test/test_filters.py index 7cb7da6b5..664b1388e 100644 --- a/python/ray/rllib/test/test_filters.py +++ b/python/ray/rllib/test/test_filters.py @@ -13,7 +13,7 @@ from ray.rllib.test.mock_evaluator import _MockEvaluator class RunningStatTest(unittest.TestCase): def testRunningStat(self): - for shp in ((), (3,), (3, 4)): + for shp in ((), (3, ), (3, 4)): li = [] rs = RunningStat(shp) for _ in range(5): @@ -22,12 +22,12 @@ class RunningStatTest(unittest.TestCase): li.append(val) m = np.mean(li, axis=0) self.assertTrue(np.allclose(rs.mean, m)) - v = (np.square(m) if (len(li) == 1) - else np.var(li, ddof=1, axis=0)) + v = (np.square(m) + if (len(li) == 1) else np.var(li, ddof=1, axis=0)) self.assertTrue(np.allclose(rs.var, v)) def testCombiningStat(self): - for shape in [(), (3,), (3, 4)]: + for shape in [(), (3, ), (3, 4)]: li = [] rs1 = RunningStat(shape) rs2 = RunningStat(shape) @@ -48,7 +48,7 @@ class RunningStatTest(unittest.TestCase): class MSFTest(unittest.TestCase): def testBasic(self): - for shape in [(), (3,), (3, 4, 4)]: + for shape in [(), (3, ), (3, 4, 4)]: filt = MeanStdFilter(shape) for i in range(5): filt(np.ones(shape)) @@ -93,8 +93,10 @@ class FilterManagerTest(unittest.TestCase): remote_e = RemoteEvaluator.remote(sample_count=10) remote_e.sample.remote() - FilterManager.synchronize( - {"obs_filter": filt1, "rew_filter": filt1.copy()}, [remote_e]) + FilterManager.synchronize({ + "obs_filter": filt1, + "rew_filter": filt1.copy() + }, [remote_e]) filters = ray.get(remote_e.get_filters.remote()) obs_f = filters["obs_filter"] diff --git a/python/ray/rllib/test/test_lstm.py b/python/ray/rllib/test/test_lstm.py index 0e92901fd..0fd6dffc3 100644 --- a/python/ray/rllib/test/test_lstm.py +++ b/python/ray/rllib/test/test_lstm.py @@ -10,22 +10,15 @@ from ray.rllib.models.lstm import chop_into_sequences class LSTMUtilsTest(unittest.TestCase): def testBasic(self): t = [1, 2, 3, 1, 2, 3, 4, 5] - f = [ - [101, 102, 103, 201, 202, 203, 204, 205], - [[101], [102], [103], [201], [202], [203], [204], [205]] - ] + f = [[101, 102, 103, 201, 202, 203, 204, 205], + [[101], [102], [103], [201], [202], [203], [204], [205]]] s = [[209, 208, 207, 109, 108, 107, 106, 105]] f_pad, s_init, seq_lens = chop_into_sequences(t, f, s, 4) - self.assertEqual( - [f.tolist() for f in f_pad], - [ - [101, 102, 103, 0, - 201, 202, 203, 204, - 205, 0, 0, 0], - [[101], [102], [103], [0], - [201], [202], [203], [204], - [205], [0], [0], [0]], - ]) + self.assertEqual([f.tolist() for f in f_pad], [ + [101, 102, 103, 0, 201, 202, 203, 204, 205, 0, 0, 0], + [[101], [102], [103], [0], [201], [202], [203], [204], [205], [0], + [0], [0]], + ]) self.assertEqual([s.tolist() for s in s_init], [[209, 109, 105]]) self.assertEqual(seq_lens.tolist(), [3, 4, 1]) diff --git a/python/ray/rllib/test/test_multi_agent_env.py b/python/ray/rllib/test/test_multi_agent_env.py index fcaabfdd9..c6ce25ed4 100644 --- a/python/ray/rllib/test/test_multi_agent_env.py +++ b/python/ray/rllib/test/test_multi_agent_env.py @@ -129,12 +129,21 @@ class TestMultiAgentEnv(unittest.TestCase): obs, rew, done, info = env.step({0: 0, 1: 0, 2: 0, 3: 0}) self.assertEqual(obs, {0: 0, 1: 0, 2: 0, 3: 0}) self.assertEqual(rew, {0: 1, 1: 1, 2: 1, 3: 1}) - self.assertEqual( - done, - {0: False, 1: False, 2: False, 3: False, "__all__": False}) + self.assertEqual(done, { + 0: False, + 1: False, + 2: False, + 3: False, + "__all__": False + }) obs, rew, done, info = env.step({0: 0, 1: 0, 2: 0, 3: 0}) - self.assertEqual( - done, {0: True, 1: True, 2: True, 3: True, "__all__": True}) + self.assertEqual(done, { + 0: True, + 1: True, + 2: True, + 3: True, + "__all__": True + }) def testRoundRobinMock(self): env = RoundRobinMultiAgent(2) @@ -156,24 +165,51 @@ class TestMultiAgentEnv(unittest.TestCase): self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}}) self.assertEqual(rew, {0: {0: None, 1: None}, 1: {0: None, 1: None}}) self.assertEqual( - dones, - {0: {0: False, 1: False, "__all__": False}, - 1: {0: False, 1: False, "__all__": False}}) + dones, { + 0: { + 0: False, + 1: False, + "__all__": False + }, + 1: { + 0: False, + 1: False, + "__all__": False + } + }) for _ in range(24): env.send_actions({0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}}) obs, rew, dones, _, _ = env.poll() self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}}) self.assertEqual(rew, {0: {0: 1, 1: 1}, 1: {0: 1, 1: 1}}) self.assertEqual( - dones, - {0: {0: False, 1: False, "__all__": False}, - 1: {0: False, 1: False, "__all__": False}}) + dones, { + 0: { + 0: False, + 1: False, + "__all__": False + }, + 1: { + 0: False, + 1: False, + "__all__": False + } + }) env.send_actions({0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}}) obs, rew, dones, _, _ = env.poll() self.assertEqual( - dones, - {0: {0: True, 1: True, "__all__": True}, - 1: {0: True, 1: True, "__all__": True}}) + dones, { + 0: { + 0: True, + 1: True, + "__all__": True + }, + 1: { + 0: True, + 1: True, + "__all__": True + } + }) # Reset processing self.assertRaises( @@ -186,9 +222,18 @@ class TestMultiAgentEnv(unittest.TestCase): self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}}) self.assertEqual(rew, {0: {0: 1, 1: 1}, 1: {0: 1, 1: 1}}) self.assertEqual( - dones, - {0: {0: False, 1: False, "__all__": False}, - 1: {0: False, 1: False, "__all__": False}}) + dones, { + 0: { + 0: False, + 1: False, + "__all__": False + }, + 1: { + 0: False, + 1: False, + "__all__": False + } + }) def testVectorizeRoundRobin(self): env = _MultiAgentEnvToAsync(lambda: RoundRobinMultiAgent(2), [], 2) @@ -217,9 +262,8 @@ class TestMultiAgentEnv(unittest.TestCase): self.assertEqual(batch.count, 50) self.assertEqual(batch.policy_batches["p0"].count, 150) self.assertEqual(batch.policy_batches["p1"].count, 100) - self.assertEqual( - batch.policy_batches["p0"]["t"].tolist(), - list(range(25)) * 6) + self.assertEqual(batch.policy_batches["p0"]["t"].tolist(), + list(range(25)) * 6) def testMultiAgentSampleRoundRobin(self): act_space = gym.spaces.Discrete(2) @@ -236,21 +280,16 @@ class TestMultiAgentEnv(unittest.TestCase): # since we round robin introduce agents into the env, some of the env # steps don't count as proper transitions self.assertEqual(batch.policy_batches["p0"].count, 42) - self.assertEqual( - batch.policy_batches["p0"]["obs"].tolist()[:10], - [0, 1, 2, 3, 4] * 2) - self.assertEqual( - batch.policy_batches["p0"]["new_obs"].tolist()[:10], - [1, 2, 3, 4, 5] * 2) - self.assertEqual( - batch.policy_batches["p0"]["rewards"].tolist()[:10], - [100, 100, 100, 100, 0] * 2) - self.assertEqual( - batch.policy_batches["p0"]["dones"].tolist()[:10], - [False, False, False, False, True] * 2) - self.assertEqual( - batch.policy_batches["p0"]["t"].tolist()[:10], - [4, 9, 14, 19, 24, 5, 10, 15, 20, 25]) + self.assertEqual(batch.policy_batches["p0"]["obs"].tolist()[:10], + [0, 1, 2, 3, 4] * 2) + self.assertEqual(batch.policy_batches["p0"]["new_obs"].tolist()[:10], + [1, 2, 3, 4, 5] * 2) + self.assertEqual(batch.policy_batches["p0"]["rewards"].tolist()[:10], + [100, 100, 100, 100, 0] * 2) + self.assertEqual(batch.policy_batches["p0"]["dones"].tolist()[:10], + [False, False, False, False, True] * 2) + self.assertEqual(batch.policy_batches["p0"]["t"].tolist()[:10], + [4, 9, 14, 19, 24, 5, 10, 15, 20, 25]) def testTrainMultiCartpoleSinglePolicy(self): n = 10 @@ -289,11 +328,17 @@ class TestMultiAgentEnv(unittest.TestCase): policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2], batch_steps=50) if optimizer_cls == AsyncGradientsOptimizer: - remote_evs = [PolicyEvaluator.as_remote().remote( - env_creator=lambda _: MultiCartpole(n), - policy_graph=policies, - policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2], - batch_steps=50)] + + def policy_mapper(agent_id): + return ["p1", "p2"][agent_id % 2] + + remote_evs = [ + PolicyEvaluator.as_remote().remote( + env_creator=lambda _: MultiCartpole(n), + policy_graph=policies, + policy_mapping_fn=policy_mapper, + batch_steps=50) + ] else: remote_evs = [] optimizer = optimizer_cls(ev, remote_evs, {}) @@ -330,8 +375,8 @@ class TestMultiAgentEnv(unittest.TestCase): obs_space = env.observation_space policies = {} for i in range(20): - policies["pg_{}".format(i)] = ( - PGPolicyGraph, obs_space, act_space, {}) + policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space, + {}) policy_ids = list(policies.keys()) ev = PolicyEvaluator( env_creator=lambda _: MultiCartpole(n), diff --git a/python/ray/rllib/test/test_optimizers.py b/python/ray/rllib/test/test_optimizers.py index c39327255..6a5022d36 100644 --- a/python/ray/rllib/test/test_optimizers.py +++ b/python/ray/rllib/test/test_optimizers.py @@ -21,8 +21,8 @@ class AsyncOptimizerTest(unittest.TestCase): local = _MockEvaluator() remotes = ray.remote(_MockEvaluator) remote_evaluators = [remotes.remote() for i in range(5)] - test_optimizer = AsyncGradientsOptimizer( - local, remote_evaluators, {"grads_per_step": 10}) + test_optimizer = AsyncGradientsOptimizer(local, remote_evaluators, + {"grads_per_step": 10}) test_optimizer.step() self.assertTrue(all(local.get_weights() == 0)) diff --git a/python/ray/rllib/test/test_policy_evaluator.py b/python/ray/rllib/test/test_policy_evaluator.py index 2817174f7..472625fb3 100644 --- a/python/ray/rllib/test/test_policy_evaluator.py +++ b/python/ray/rllib/test/test_policy_evaluator.py @@ -66,8 +66,7 @@ class MockEnv2(gym.Env): class MockVectorEnv(VectorEnv): def __init__(self, episode_length, num_envs): - self.envs = [ - MockEnv(episode_length) for _ in range(num_envs)] + self.envs = [MockEnv(episode_length) for _ in range(num_envs)] self.observation_space = gym.spaces.Discrete(1) self.action_space = gym.spaces.Discrete(2) self.num_envs = num_envs @@ -102,7 +101,10 @@ class TestPolicyEvaluator(unittest.TestCase): def testQueryEvaluators(self): register_env("test", lambda _: gym.make("CartPole-v0")) pg = PGAgent( - env="test", config={"num_workers": 2, "sample_batch_size": 5}) + env="test", config={ + "num_workers": 2, + "sample_batch_size": 5 + }) results = pg.optimizer.foreach_evaluator(lambda ev: ev.batch_steps) results2 = pg.optimizer.foreach_evaluator_with_index( lambda ev, i: (i, ev.batch_steps)) @@ -112,10 +114,12 @@ class TestPolicyEvaluator(unittest.TestCase): def testMetrics(self): ev = PolicyEvaluator( env_creator=lambda _: MockEnv(episode_length=10), - policy_graph=MockPolicyGraph, batch_mode="complete_episodes") + policy_graph=MockPolicyGraph, + batch_mode="complete_episodes") remote_ev = PolicyEvaluator.as_remote().remote( env_creator=lambda _: MockEnv(episode_length=10), - policy_graph=MockPolicyGraph, batch_mode="complete_episodes") + policy_graph=MockPolicyGraph, + batch_mode="complete_episodes") ev.sample() ray.get(remote_ev.sample.remote()) result = collect_metrics(ev, [remote_ev]) @@ -149,7 +153,8 @@ class TestPolicyEvaluator(unittest.TestCase): env_creator=lambda _: MockEnv(episode_length=20), policy_graph=MockPolicyGraph, batch_mode="truncate_episodes", - batch_steps=16, num_envs=8) + batch_steps=16, + num_envs=8) for _ in range(8): batch = ev.sample() self.assertEqual(batch.count, 16) @@ -175,7 +180,8 @@ class TestPolicyEvaluator(unittest.TestCase): env_creator=lambda _: MockEnv(episode_length=8), policy_graph=MockPolicyGraph, batch_mode="truncate_episodes", - batch_steps=16, num_envs=4) + batch_steps=16, + num_envs=4) batch = ev.sample() self.assertEqual(batch.count, 16) result = collect_metrics(ev, []) @@ -186,8 +192,7 @@ class TestPolicyEvaluator(unittest.TestCase): def testVectorEnvSupport(self): ev = PolicyEvaluator( - env_creator=lambda _: MockVectorEnv( - episode_length=20, num_envs=8), + env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8), policy_graph=MockPolicyGraph, batch_mode="truncate_episodes", batch_steps=10) diff --git a/python/ray/rllib/test/test_serving_env.py b/python/ray/rllib/test/test_serving_env.py index eadfe1164..5d9dd641a 100644 --- a/python/ray/rllib/test/test_serving_env.py +++ b/python/ray/rllib/test/test_serving_env.py @@ -83,8 +83,8 @@ class MultiServing(ServingEnv): def __init__(self, env_creator): self.env_creator = env_creator self.env = env_creator() - ServingEnv.__init__( - self, self.env.action_space, self.env.observation_space) + ServingEnv.__init__(self, self.env.action_space, + self.env.observation_space) def run(self): envs = [self.env_creator() for _ in range(5)] @@ -97,8 +97,7 @@ class MultiServing(ServingEnv): eids[i] = uuid.uuid4().hex self.start_episode(episode_id=eids[i]) cur_obs[i] = envs[i].reset() - actions = [ - self.get_action(eids[i], cur_obs[i]) for i in active] + actions = [self.get_action(eids[i], cur_obs[i]) for i in active] for i, action in zip(active, actions): obs, reward, done, _ = envs[i].step(action) cur_obs[i] = obs @@ -164,8 +163,7 @@ class TestServingEnv(unittest.TestCase): raise Exception("failed to improve reward") def testTrainCartpole(self): - register_env( - "test", lambda _: SimpleServing(gym.make("CartPole-v0"))) + register_env("test", lambda _: SimpleServing(gym.make("CartPole-v0"))) pg = PGAgent(env="test", config={"num_workers": 0}) for i in range(100): result = pg.train() @@ -176,8 +174,8 @@ class TestServingEnv(unittest.TestCase): raise Exception("failed to improve reward") def testTrainCartpoleMulti(self): - register_env( - "test2", lambda _: MultiServing(lambda: gym.make("CartPole-v0"))) + register_env("test2", + lambda _: MultiServing(lambda: gym.make("CartPole-v0"))) pg = PGAgent(env="test2", config={"num_workers": 0}) for i in range(100): result = pg.train() diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py index 1189168e8..bbdbda4b0 100644 --- a/python/ray/rllib/test/test_supported_spaces.py +++ b/python/ray/rllib/test/test_supported_spaces.py @@ -14,27 +14,29 @@ from ray.tune.registry import register_env ACTION_SPACES_TO_TEST = { "discrete": Discrete(5), - "vector": Box(0.0, 1.0, (5,), dtype=np.float32), + "vector": Box(0.0, 1.0, (5, ), dtype=np.float32), "simple_tuple": Tuple([ - Box(0.0, 1.0, (5,), dtype=np.float32), - Box(0.0, 1.0, (5,), dtype=np.float32)]), + Box(0.0, 1.0, (5, ), dtype=np.float32), + Box(0.0, 1.0, (5, ), dtype=np.float32) + ]), "implicit_tuple": [ - Box(0.0, 1.0, (5,), dtype=np.float32), - Box(0.0, 1.0, (5,), dtype=np.float32)], + Box(0.0, 1.0, (5, ), dtype=np.float32), + Box(0.0, 1.0, (5, ), dtype=np.float32) + ], } OBSERVATION_SPACES_TO_TEST = { "discrete": Discrete(5), - "vector": Box(0.0, 1.0, (5,), dtype=np.float32), + "vector": Box(0.0, 1.0, (5, ), dtype=np.float32), "image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32), "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32), - "atari_ram": Box(0.0, 1.0, (128,), dtype=np.float32), + "atari_ram": Box(0.0, 1.0, (128, ), dtype=np.float32), "simple_tuple": Tuple([ - Box(0.0, 1.0, (5,), dtype=np.float32), - Box(0.0, 1.0, (5,), dtype=np.float32)]), - "mixed_tuple": Tuple([ - Discrete(10), - Box(0.0, 1.0, (5,), dtype=np.float32)]), + Box(0.0, 1.0, (5, ), dtype=np.float32), + Box(0.0, 1.0, (5, ), dtype=np.float32) + ]), + "mixed_tuple": Tuple( + [Discrete(10), Box(0.0, 1.0, (5, ), dtype=np.float32)]), } @@ -90,30 +92,33 @@ class ModelSupportedSpaces(unittest.TestCase): stats = {} check_support("DDPG", {"timesteps_per_iteration": 1}, stats) check_support("DQN", {"timesteps_per_iteration": 1}, stats) + check_support("A3C", { + "num_workers": 1, + "optimizer": { + "grads_per_step": 1 + } + }, stats) check_support( - "A3C", {"num_workers": 1, "optimizer": {"grads_per_step": 1}}, - stats) + "PPO", { + "num_workers": 1, + "num_sgd_iter": 1, + "timesteps_per_batch": 1, + "sgd_batchsize": 1 + }, stats) check_support( - "PPO", - {"num_workers": 1, "num_sgd_iter": 1, "timesteps_per_batch": 1, - "sgd_batchsize": 1}, - stats) - check_support( - "ES", - {"num_workers": 1, "noise_size": 10000000, - "episodes_per_batch": 1, "timesteps_per_batch": 1}, - stats) - check_support( - "PG", - {"num_workers": 1, "optimizer": {}}, - stats) + "ES", { + "num_workers": 1, + "noise_size": 10000000, + "episodes_per_batch": 1, + "timesteps_per_batch": 1 + }, stats) + check_support("PG", {"num_workers": 1, "optimizer": {}}, stats) num_unexpected_errors = 0 for (alg, a_name, o_name), stat in sorted(stats.items()): if stat not in ["ok", "unsupported"]: num_unexpected_errors += 1 - print( - alg, "action_space", a_name, "obs_space", o_name, - "result", stat) + print(alg, "action_space", a_name, "obs_space", o_name, "result", + stat) self.assertEqual(num_unexpected_errors, 0) @@ -123,7 +128,7 @@ if __name__ == "__main__": "discrete": Discrete(5), } OBSERVATION_SPACES_TO_TEST = { - "vector": Box(0.0, 1.0, (5,), dtype=np.float32), + "vector": Box(0.0, 1.0, (5, ), dtype=np.float32), "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32), } unittest.main(verbosity=2) diff --git a/python/ray/rllib/train.py b/python/ray/rllib/train.py index 736eb7ec2..ac18025a9 100755 --- a/python/ray/rllib/train.py +++ b/python/ray/rllib/train.py @@ -11,7 +11,6 @@ import ray from ray.tune.config_parser import make_parser, resources_to_json from ray.tune.tune import _make_scheduler, run_experiments - EXAMPLE_USAGE = """ Training example via RLlib CLI: rllib train --run DQN --env CartPole-v0 @@ -35,29 +34,41 @@ def create_parser(parser_creator=None): # See also the base parser definition in ray/tune/config_parser.py parser.add_argument( - "--redis-address", default=None, type=str, + "--redis-address", + default=None, + type=str, help="The Redis address of the cluster.") parser.add_argument( - "--ray-num-cpus", default=None, type=int, + "--ray-num-cpus", + default=None, + type=int, help="--num-cpus to pass to Ray." - " This only has an affect in local mode.") + " This only has an affect in local mode.") parser.add_argument( - "--ray-num-gpus", default=None, type=int, + "--ray-num-gpus", + default=None, + type=int, help="--num-gpus to pass to Ray." - " This only has an affect in local mode.") + " This only has an affect in local mode.") parser.add_argument( - "--experiment-name", default="default", type=str, + "--experiment-name", + default="default", + type=str, help="Name of the subdirectory under `local_dir` to put results in.") parser.add_argument( "--env", default=None, type=str, help="The gym environment to use.") parser.add_argument( - "--queue-trials", action='store_true', + "--queue-trials", + action='store_true', help=( "Whether to queue trials when the cluster does not currently have " "enough resources to launch one. This should be set to True when " "running on an autoscaling cluster to enable automatic scale-up.")) parser.add_argument( - "-f", "--config-file", default=None, type=str, + "-f", + "--config-file", + default=None, + type=str, help="If specified, use config options from this file. Note that this " "overrides any trial-specific options set via flags above.") return parser @@ -93,9 +104,11 @@ def run(args, parser): ray.init( redis_address=args.redis_address, - num_cpus=args.ray_num_cpus, num_gpus=args.ray_num_gpus) + num_cpus=args.ray_num_cpus, + num_gpus=args.ray_num_gpus) run_experiments( - experiments, scheduler=_make_scheduler(args), + experiments, + scheduler=_make_scheduler(args), queue_trials=args.queue_trials) diff --git a/python/ray/rllib/tuned_examples/generate_regression_tests.py b/python/ray/rllib/tuned_examples/generate_regression_tests.py index 12cbe2d61..3196bd4d0 100755 --- a/python/ray/rllib/tuned_examples/generate_regression_tests.py +++ b/python/ray/rllib/tuned_examples/generate_regression_tests.py @@ -6,10 +6,8 @@ import re import os import os.path as osp - CONFIG_DIR = osp.join(osp.dirname(osp.abspath(__file__)), "regression_tests") - TEMPLATE = """ class Test{name}(Regression): _file = "{filename}" diff --git a/python/ray/rllib/tuned_examples/regression_tests/regression_test.py b/python/ray/rllib/tuned_examples/regression_tests/regression_test.py index 58433da57..a4624f372 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/regression_test.py +++ b/python/ray/rllib/tuned_examples/regression_tests/regression_test.py @@ -15,7 +15,6 @@ import yaml import ray from ray import tune - CONFIG_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/python/ray/rllib/tuned_examples/run_regression_tests.py b/python/ray/rllib/tuned_examples/run_regression_tests.py index 3bb7d5224..65ba1a310 100755 --- a/python/ray/rllib/tuned_examples/run_regression_tests.py +++ b/python/ray/rllib/tuned_examples/run_regression_tests.py @@ -8,7 +8,6 @@ import yaml import ray from ray.tune import run_experiments - if __name__ == '__main__': experiments = {} @@ -29,5 +28,4 @@ if __name__ == '__main__': num_failures += 1 if num_failures: - raise Exception( - "{} trials did not converge".format(num_failures)) + raise Exception("{} trials did not converge".format(num_failures)) diff --git a/python/ray/rllib/utils/compression.py b/python/ray/rllib/utils/compression.py index dee8d875d..ddef7a6ab 100644 --- a/python/ray/rllib/utils/compression.py +++ b/python/ray/rllib/utils/compression.py @@ -11,10 +11,9 @@ try: import lz4.frame LZ4_ENABLED = True except ImportError: - print( - "WARNING: lz4 not available, disabling sample compression. " - "This will significantly impact RLlib performance. " - "To install lz4, run `pip install lz4`.") + print("WARNING: lz4 not available, disabling sample compression. " + "This will significantly impact RLlib performance. " + "To install lz4, run `pip install lz4`.") LZ4_ENABLED = False diff --git a/python/ray/rllib/utils/filter.py b/python/ray/rllib/utils/filter.py index 6e60b4e5f..b2a361948 100644 --- a/python/ray/rllib/utils/filter.py +++ b/python/ray/rllib/utils/filter.py @@ -59,7 +59,6 @@ class NoFilter(Filter): # http://www.johndcook.com/blog/standard_deviation/ class RunningStat(object): - def __init__(self, shape=None): self._n = 0 self._M = np.zeros(shape) @@ -227,8 +226,8 @@ class MeanStdFilter(Filter): def __repr__(self): return 'MeanStdFilter({}, {}, {}, {}, {}, {})'.format( - self.shape, self.demean, self.destd, - self.clip, self.rs, self.buffer) + self.shape, self.demean, self.destd, self.clip, self.rs, + self.buffer) class ConcurrentMeanStdFilter(MeanStdFilter): @@ -242,6 +241,7 @@ class ConcurrentMeanStdFilter(MeanStdFilter): def wrapper(*args, **kwargs): with self._lock: return func(*args, **kwargs) + return wrapper self.__getattribute__ = lock_wrap(self.__getattribute__) @@ -260,8 +260,8 @@ class ConcurrentMeanStdFilter(MeanStdFilter): def __repr__(self): return 'ConcurrentMeanStdFilter({}, {}, {}, {}, {}, {})'.format( - self.shape, self.demean, self.destd, - self.clip, self.rs, self.buffer) + self.shape, self.demean, self.destd, self.clip, self.rs, + self.buffer) def get_filter(filter_config, shape): @@ -273,5 +273,4 @@ def get_filter(filter_config, shape): elif filter_config == "NoFilter": return NoFilter() else: - raise Exception("Unknown observation_filter: " + - str(filter_config)) + raise Exception("Unknown observation_filter: " + str(filter_config)) diff --git a/python/ray/rllib/utils/policy_server.py b/python/ray/rllib/utils/policy_server.py index 554d74974..7a5a05093 100644 --- a/python/ray/rllib/utils/policy_server.py +++ b/python/ray/rllib/utils/policy_server.py @@ -75,14 +75,14 @@ def _make_handler(serving_env): response["action"] = serving_env.get_action( args["episode_id"], args["observation"]) elif command == PolicyClient.LOG_ACTION: - serving_env.log_action( - args["episode_id"], args["observation"], args["action"]) + serving_env.log_action(args["episode_id"], args["observation"], + args["action"]) elif command == PolicyClient.LOG_RETURNS: - serving_env.log_returns( - args["episode_id"], args["reward"], args["info"]) + serving_env.log_returns(args["episode_id"], args["reward"], + args["info"]) elif command == PolicyClient.END_EPISODE: - serving_env.end_episode( - args["episode_id"], args["observation"]) + serving_env.end_episode(args["episode_id"], + args["observation"]) else: raise Exception("Unknown command: {}".format(command)) return response diff --git a/python/ray/rllib/utils/reshaper.py b/python/ray/rllib/utils/reshaper.py index c0687b488..e9c165212 100644 --- a/python/ray/rllib/utils/reshaper.py +++ b/python/ray/rllib/utils/reshaper.py @@ -7,6 +7,7 @@ class Reshaper(object): This class keeps track of where in the flattened observation space we should be slicing and what the new shapes should be """ + def __init__(self, env_space): self.shapes = [] self.slice_positions = [] @@ -24,8 +25,8 @@ class Reshaper(object): if len(self.slice_positions) == 0: self.slice_positions.append(np.product(arr_shape)) else: - self.slice_positions.append(np.product(arr_shape) + - self.slice_positions[-1]) + self.slice_positions.append( + np.product(arr_shape) + self.slice_positions[-1]) else: self.shapes.append(np.asarray(env_space.shape)) self.slice_positions.append(np.product(env_space.shape)) @@ -38,11 +39,11 @@ class Reshaper(object): def split_tensor(self, tensor, axis=-1): # FIXME (ev) This won't work for mixed action distributions like # one agent Gaussian one agent discrete - slice_rescale = int(tensor.shape.as_list()[axis] / - int(np.sum(self.get_slice_lengths()))) - return tf.split(tensor, slice_rescale*self.get_slice_lengths(), - axis=axis) + slice_rescale = int(tensor.shape.as_list()[axis] / int( + np.sum(self.get_slice_lengths()))) + return tf.split( + tensor, slice_rescale * self.get_slice_lengths(), axis=axis) def split_number(self, number): slice_rescale = int(number / int(np.sum(self.get_slice_lengths()))) - return slice_rescale*self.get_slice_lengths() + return slice_rescale * self.get_slice_lengths() diff --git a/python/ray/rllib/utils/schedules.py b/python/ray/rllib/utils/schedules.py index d9ceb2f76..41518e6b9 100644 --- a/python/ray/rllib/utils/schedules.py +++ b/python/ray/rllib/utils/schedules.py @@ -39,10 +39,10 @@ def linear_interpolation(l, r, alpha): class PiecewiseSchedule(object): - def __init__( - self, endpoints, interpolation=linear_interpolation, - outside_value=None): - + def __init__(self, + endpoints, + interpolation=linear_interpolation, + outside_value=None): """Piecewise schedule. endpoints: [(int, int)] diff --git a/python/ray/rllib/utils/tf_run_builder.py b/python/ray/rllib/utils/tf_run_builder.py index 6512fc85c..030642ae5 100644 --- a/python/ray/rllib/utils/tf_run_builder.py +++ b/python/ray/rllib/utils/tf_run_builder.py @@ -64,18 +64,19 @@ def run_timeline(sess, ops, debug_name, feed_dict={}, timeline_dir=None): run_metadata = tf.RunMetadata() start = time.time() fetches = sess.run( - ops, options=run_options, run_metadata=run_metadata, + ops, + options=run_options, + run_metadata=run_metadata, feed_dict=feed_dict) trace = timeline.Timeline(step_stats=run_metadata.step_stats) global _count outf = os.path.join( - timeline_dir, - "timeline-{}-{}-{}.json".format(debug_name, os.getpid(), _count)) + timeline_dir, "timeline-{}-{}-{}.json".format( + debug_name, os.getpid(), _count)) _count += 1 trace_file = open(outf, "w") - print( - "Wrote tf timeline ({} s) to {}".format( - time.time() - start, os.path.abspath(outf))) + print("Wrote tf timeline ({} s) to {}".format(time.time() - start, + os.path.abspath(outf))) trace_file.write(trace.generate_chrome_trace_format()) else: fetches = sess.run(ops, feed_dict=feed_dict) diff --git a/python/ray/rllib/utils/window_stat.py b/python/ray/rllib/utils/window_stat.py index ed1d99c46..21c93069a 100644 --- a/python/ray/rllib/utils/window_stat.py +++ b/python/ray/rllib/utils/window_stat.py @@ -22,8 +22,8 @@ class WindowStat(object): if not self.count: quantiles = [] else: - quantiles = np.percentile( - self.items[:self.count], [0, 10, 50, 90, 100]).tolist() + quantiles = np.percentile(self.items[:self.count], + [0, 10, 50, 90, 100]).tolist() return { self.name + "_count": int(self.count), self.name + "_mean": float(np.mean(self.items[:self.count])),