From 617ff797ba4fbed4014b786f805163514955865b Mon Sep 17 00:00:00 2001 From: wassname Date: Tue, 29 Dec 2020 08:53:19 +0800 Subject: [PATCH] apple gym runs --- .gitignore | 1 + Makefile | 2 + load_demonstrations.py | 2 +- main.py | 126 +++++++++++++++++++++++------------------ play.py | 77 +++++++++++++++++++++++++ replay_memory.py | 16 ++++++ 6 files changed, 167 insertions(+), 57 deletions(-) create mode 100644 Makefile create mode 100644 play.py diff --git a/.gitignore b/.gitignore index 93cd52f..8f5187c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ __pycache__/ runs/ +data diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d4600bc --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +run: + python main.py --demonstrations data/demonstrations diff --git a/load_demonstrations.py b/load_demonstrations.py index ef5fdac..7674dce 100644 --- a/load_demonstrations.py +++ b/load_demonstrations.py @@ -11,7 +11,7 @@ from pathlib import Path def load_demonstrations(mem: ReplayMemory, recordings: Path): records = get_recordings(str(recordings)) ends=records["episodes_end_point"] - for i in tqdm(range(len(ends))-1, desc='loading demonstrations'): + for i in tqdm(range(len(ends)-1), desc='loading demonstrations'): a = ends[i] b = ends[i+1] for s in range(a+1, b): diff --git a/main.py b/main.py index 6ed0bb3..66e712b 100644 --- a/main.py +++ b/main.py @@ -9,6 +9,8 @@ from torch.utils.tensorboard import SummaryWriter from replay_memory import ReplayMemory from load_demonstrations import load_demonstrations import apple_gym.env +import pickle +from tqdm.auto import tqdm parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args') parser.add_argument('--env-name', default="ApplePick-v0", @@ -52,7 +54,7 @@ args = parser.parse_args() # Environment # env = NormalizedActions(gym.make(args.env_name)) -env = gym.make(args.env_name) +env = gym.make(args.env_name, render=False) env.seed(args.seed) env.action_space.seed(args.seed) @@ -63,86 +65,98 @@ np.random.seed(args.seed) agent = SAC(env.observation_space.shape[0], env.action_space, args) #Tesnorboard -writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name, - args.policy, "autotune" if args.automatic_entropy_tuning else "")) +log_name = '{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name, + args.policy, "autotune" if args.automatic_entropy_tuning else "") +writer = SummaryWriter('runs/' + log_name) # Memory memory=ReplayMemory(args.replay_size, args.seed) if args.demonstrations: load_demonstrations(memory, args.demonstrations) +def save(): + agent.save_model(args.env_name, "", "models/actor_" + log_name+'.pkl', "models/critic_"+log_name+'.pkl') + memory.save(args.env_name, "", "models/memory_" + log_name +'.pkl') + # agent.load_model("models/actor_" + log_name + '.pkl', "models/critic_" + log_name + '.pkl') + # memory.load("models/memory_" + log_name +'.pkl') + # Training Loop total_numsteps = 0 updates = 0 -for i_episode in itertools.count(1): - episode_reward = 0 - episode_steps = 0 - done = False - state = env.reset() +with tqdm(unit='frames') as prog: + for i_episode in itertools.count(1): + episode_reward = 0 + episode_steps = 0 + done = False + state = env.reset() - while not done: - if args.start_steps > total_numsteps: - action = env.action_space.sample() # Sample random action - else: - action = agent.select_action(state) # Sample action from policy + while not done: + if args.start_steps > total_numsteps: + action = env.action_space.sample() # Sample random action + else: + action = agent.select_action(state) # Sample action from policy - if len(memory) > args.batch_size: - # Number of updates per step in environment - for i in range(args.updates_per_step): - # Update parameters of all the networks - critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, args.batch_size, updates) + if len(memory) > args.batch_size: + # Number of updates per step in environment + for i in range(args.updates_per_step): + # Update parameters of all the networks + critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, args.batch_size, updates) - writer.add_scalar('loss/critic_1', critic_1_loss, updates) - writer.add_scalar('loss/critic_2', critic_2_loss, updates) - writer.add_scalar('loss/policy', policy_loss, updates) - writer.add_scalar('loss/entropy_loss', ent_loss, updates) - writer.add_scalar('entropy_temprature/alpha', alpha, updates) - updates += 1 + writer.add_scalar('loss/critic_1', critic_1_loss, updates) + writer.add_scalar('loss/critic_2', critic_2_loss, updates) + writer.add_scalar('loss/policy', policy_loss, updates) + writer.add_scalar('loss/entropy_loss', ent_loss, updates) + writer.add_scalar('entropy_temprature/alpha', alpha, updates) + updates += 1 - next_state, reward, done, _ = env.step(action) # Step - episode_steps += 1 - total_numsteps += 1 - episode_reward += reward + next_state, reward, done, _ = env.step(action) # Step + episode_steps += 1 + total_numsteps += 1 + episode_reward += reward + prog.update(1) + prog.desc = f'er={episode_reward/episode_steps:2.2f}' - # Ignore the "done" signal if it comes from hitting the time horizon. - # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py) - mask = 1 if episode_steps == env._max_episode_steps else float(not done) + # Ignore the "done" signal if it comes from hitting the time horizon. + # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py) + mask = 1 if episode_steps == env._max_episode_steps else float(not done) - memory.push(state, action, reward, next_state, mask) # Append transition to memory + memory.push(state, action, reward, next_state, mask) # Append transition to memory - state = next_state + state = next_state - if total_numsteps > args.num_steps: - break + if total_numsteps > args.num_steps: + break - writer.add_scalar('reward/train', episode_reward, i_episode) - print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2))) + writer.add_scalar('reward/train', episode_reward, i_episode) + print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2))) - if i_episode % 10 == 0 and args.eval is True: - avg_reward = 0. - episodes = 10 - for _ in range(episodes): - state = env.reset() - episode_reward = 0 - done = False - while not done: - action = agent.select_action(state, evaluate=True) + if i_episode % 10 == 0 and args.eval is True: + avg_reward = 0. + episodes = 10 + for _ in range(episodes): + state = env.reset() + episode_reward = 0 + done = False + while not done: + action = agent.select_action(state, evaluate=True) - next_state, reward, done, _ = env.step(action) - episode_reward += reward + next_state, reward, done, _ = env.step(action) + episode_reward += reward - state = next_state - avg_reward += episode_reward - avg_reward /= episodes + state = next_state + avg_reward += episode_reward + avg_reward /= episodes - writer.add_scalar('avg_reward/test', avg_reward, i_episode) + writer.add_scalar('avg_reward/test', avg_reward, i_episode) - print("----------------------------------------") - print("Test Episodes: {}, Avg. Reward: {}".format(episodes, round(avg_reward, 2))) - print("----------------------------------------") + save() + + print("----------------------------------------") + print("Test Episodes: {}, Avg. Reward: {}".format(episodes, round(avg_reward, 2))) + print("----------------------------------------") env.close() - +save() diff --git a/play.py b/play.py new file mode 100644 index 0000000..f99ab55 --- /dev/null +++ b/play.py @@ -0,0 +1,77 @@ +import argparse +import datetime +import gym +import numpy as np +import itertools +import torch +from sac import SAC +from torch.utils.tensorboard import SummaryWriter +from replay_memory import ReplayMemory +from load_demonstrations import load_demonstrations +import apple_gym.env +import pickle + +parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args') +parser.add_argument('--env-name', default="ApplePick-v0", + help='Mujoco Gym environment (default: ApplePick-v0)') +parser.add_argument('--policy', default="Gaussian", + help='Policy Type: Gaussian | Deterministic (default: Gaussian)') +parser.add_argument('--eval', type=bool, default=True, + help='Evaluates a policy a policy every 10 episode (default: True)') +parser.add_argument('--gamma', type=float, default=0.99, metavar='G', + help='discount factor for reward (default: 0.99)') +parser.add_argument('--tau', type=float, default=0.005, metavar='G', + help='target smoothing coefficient(τ) (default: 0.005)') +parser.add_argument('--lr', type=float, default=0.0003, metavar='G', + help='learning rate (default: 0.0003)') +parser.add_argument('--alpha', type=float, default=0.2, metavar='G', + help='Temperature parameter α determines the relative importance of the entropy\ + term against the reward (default: 0.2)') +parser.add_argument('--automatic_entropy_tuning', type=bool, default=False, metavar='G', + help='Automaically adjust α (default: False)') +parser.add_argument('--seed', type=int, default=123456, metavar='N', + help='random seed (default: 123456)') +parser.add_argument('--batch_size', type=int, default=256, metavar='N', + help='batch size (default: 256)') +parser.add_argument('--num_steps', type=int, default=1000001, metavar='N', + help='maximum number of steps (default: 1000000)') +parser.add_argument('--hidden_size', type=int, default=256, metavar='N', + help='hidden size (default: 256)') +parser.add_argument('--updates_per_step', type=int, default=1, metavar='N', + help='model updates per simulator step (default: 1)') +parser.add_argument('--start_steps', type=int, default=10000, metavar='N', + help='Steps sampling random actions (default: 10000)') +parser.add_argument('--target_update_interval', type=int, default=1, metavar='N', + help='Value target update per no. of updates per step (default: 1)') +parser.add_argument('--replay_size', type=int, default=1000000, metavar='N', + help='size of replay buffer (default: 10000000)') +parser.add_argument('--cuda', action="store_true", + help='run on CUDA (default: False)') +parser.add_argument('--demonstrations', default=False, + help='Load demonstrations from https://github.com/erfanMhi/gym-recording-modified') +args = parser.parse_args() + +# Environment +# env = NormalizedActions(gym.make(args.env_name)) +env = gym.make(args.env_name, render=True) +env.seed(args.seed) +env.action_space.seed(args.seed) + +torch.manual_seed(args.seed) +np.random.seed(args.seed) + +# Agent +agent = SAC(env.observation_space.shape[0], env.action_space, args) + +#Tesnorboard +log_name = '{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name, + args.policy, "autotune" if args.automatic_entropy_tuning else "") +writer = SummaryWriter('runs/' + log_name) + +# Memory +memory=ReplayMemory(args.replay_size, args.seed) +if args.demonstrations: + load_demonstrations(memory, args.demonstrations) + +agent.load_model("models/actor_" + log_name + '.pkl', "models/critic_" + log_name + '.pkl') +memory.load("models/memory_" + log_name +'.pkl') diff --git a/replay_memory.py b/replay_memory.py index 5152063..560b7b8 100644 --- a/replay_memory.py +++ b/replay_memory.py @@ -1,5 +1,7 @@ import random import numpy as np +import pickle +import os class ReplayMemory: def __init__(self, capacity, seed): @@ -21,3 +23,17 @@ class ReplayMemory: def __len__(self): return len(self.buffer) + + def save(self, env_name, suffix="", memory_path=None): + if not os.path.exists('models/'): + os.makedirs('models/') + + if memory_path is None: + memory_path = "models/memory_buffer_{}_{}".format(env_name, suffix) + print('Saving memory to {}'.format(memory_path)) + pickle.dump(self.buffer, open(memory_path, 'wb')) + + def load(self, memory_path): + print('Loading memory from {}'.format(memory_path)) + if memory_path is not None: + self.buffer = pickle.load(open(memory_path, 'rb'))