apple gym runs

This commit is contained in:
wassname
2020-12-29 08:53:19 +08:00
parent 10c6b6e595
commit 617ff797ba
6 changed files with 167 additions and 57 deletions
+1
View File
@@ -1,2 +1,3 @@
__pycache__/
runs/
data
+2
View File
@@ -0,0 +1,2 @@
# Launch main.py with the recorded demonstrations found in data/demonstrations.
# Declared phony so a stray file or directory named "run" cannot make
# `make run` report "up to date" and silently do nothing.
.PHONY: run
run:
	python main.py --demonstrations data/demonstrations
+1 -1
View File
@@ -11,7 +11,7 @@ from pathlib import Path
def load_demonstrations(mem: ReplayMemory, recordings: Path):
records = get_recordings(str(recordings))
ends=records["episodes_end_point"]
for i in tqdm(range(len(ends))-1, desc='loading demonstrations'):
for i in tqdm(range(len(ends)-1), desc='loading demonstrations'):
a = ends[i]
b = ends[i+1]
for s in range(a+1, b):
+70 -56
View File
@@ -9,6 +9,8 @@ from torch.utils.tensorboard import SummaryWriter
from replay_memory import ReplayMemory
from load_demonstrations import load_demonstrations
import apple_gym.env
import pickle
from tqdm.auto import tqdm
parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args')
parser.add_argument('--env-name', default="ApplePick-v0",
@@ -52,7 +54,7 @@ args = parser.parse_args()
# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name)
env = gym.make(args.env_name, render=False)
env.seed(args.seed)
env.action_space.seed(args.seed)
@@ -63,86 +65,98 @@ np.random.seed(args.seed)
agent = SAC(env.observation_space.shape[0], env.action_space, args)
# TensorBoard
writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
args.policy, "autotune" if args.automatic_entropy_tuning else ""))
log_name = '{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
args.policy, "autotune" if args.automatic_entropy_tuning else "")
writer = SummaryWriter('runs/' + log_name)
# Memory
memory=ReplayMemory(args.replay_size, args.seed)
if args.demonstrations:
load_demonstrations(memory, args.demonstrations)
def save():
    """Write the agent's actor/critic files and the replay memory under models/.

    All three file names share the current run's ``log_name`` so the
    artefacts of one run can be matched up afterwards.
    """
    actor_path = "models/actor_{}.pkl".format(log_name)
    critic_path = "models/critic_{}.pkl".format(log_name)
    memory_path = "models/memory_{}.pkl".format(log_name)

    agent.save_model(args.env_name, "", actor_path, critic_path)
    memory.save(args.env_name, "", memory_path)
# agent.load_model("models/actor_" + log_name + '.pkl', "models/critic_" + log_name + '.pkl')
# memory.load("models/memory_" + log_name +'.pkl')
# Training Loop
total_numsteps = 0
updates = 0
for i_episode in itertools.count(1):
episode_reward = 0
episode_steps = 0
done = False
state = env.reset()
with tqdm(unit='frames') as prog:
for i_episode in itertools.count(1):
episode_reward = 0
episode_steps = 0
done = False
state = env.reset()
while not done:
if args.start_steps > total_numsteps:
action = env.action_space.sample() # Sample random action
else:
action = agent.select_action(state) # Sample action from policy
while not done:
if args.start_steps > total_numsteps:
action = env.action_space.sample() # Sample random action
else:
action = agent.select_action(state) # Sample action from policy
if len(memory) > args.batch_size:
# Number of updates per step in environment
for i in range(args.updates_per_step):
# Update parameters of all the networks
critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, args.batch_size, updates)
if len(memory) > args.batch_size:
# Number of updates per step in environment
for i in range(args.updates_per_step):
# Update parameters of all the networks
critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, args.batch_size, updates)
writer.add_scalar('loss/critic_1', critic_1_loss, updates)
writer.add_scalar('loss/critic_2', critic_2_loss, updates)
writer.add_scalar('loss/policy', policy_loss, updates)
writer.add_scalar('loss/entropy_loss', ent_loss, updates)
writer.add_scalar('entropy_temprature/alpha', alpha, updates)
updates += 1
writer.add_scalar('loss/critic_1', critic_1_loss, updates)
writer.add_scalar('loss/critic_2', critic_2_loss, updates)
writer.add_scalar('loss/policy', policy_loss, updates)
writer.add_scalar('loss/entropy_loss', ent_loss, updates)
writer.add_scalar('entropy_temprature/alpha', alpha, updates)
updates += 1
next_state, reward, done, _ = env.step(action) # Step
episode_steps += 1
total_numsteps += 1
episode_reward += reward
next_state, reward, done, _ = env.step(action) # Step
episode_steps += 1
total_numsteps += 1
episode_reward += reward
prog.update(1)
prog.desc = f'er={episode_reward/episode_steps:2.2f}'
# Ignore the "done" signal if it comes from hitting the time horizon.
# (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
mask = 1 if episode_steps == env._max_episode_steps else float(not done)
# Ignore the "done" signal if it comes from hitting the time horizon.
# (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
mask = 1 if episode_steps == env._max_episode_steps else float(not done)
memory.push(state, action, reward, next_state, mask) # Append transition to memory
memory.push(state, action, reward, next_state, mask) # Append transition to memory
state = next_state
state = next_state
if total_numsteps > args.num_steps:
break
if total_numsteps > args.num_steps:
break
writer.add_scalar('reward/train', episode_reward, i_episode)
print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))
writer.add_scalar('reward/train', episode_reward, i_episode)
print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))
if i_episode % 10 == 0 and args.eval is True:
avg_reward = 0.
episodes = 10
for _ in range(episodes):
state = env.reset()
episode_reward = 0
done = False
while not done:
action = agent.select_action(state, evaluate=True)
if i_episode % 10 == 0 and args.eval is True:
avg_reward = 0.
episodes = 10
for _ in range(episodes):
state = env.reset()
episode_reward = 0
done = False
while not done:
action = agent.select_action(state, evaluate=True)
next_state, reward, done, _ = env.step(action)
episode_reward += reward
next_state, reward, done, _ = env.step(action)
episode_reward += reward
state = next_state
avg_reward += episode_reward
avg_reward /= episodes
state = next_state
avg_reward += episode_reward
avg_reward /= episodes
writer.add_scalar('avg_reward/test', avg_reward, i_episode)
writer.add_scalar('avg_reward/test', avg_reward, i_episode)
print("----------------------------------------")
print("Test Episodes: {}, Avg. Reward: {}".format(episodes, round(avg_reward, 2)))
print("----------------------------------------")
save()
print("----------------------------------------")
print("Test Episodes: {}, Avg. Reward: {}".format(episodes, round(avg_reward, 2)))
print("----------------------------------------")
env.close()
save()
+77
View File
@@ -0,0 +1,77 @@
import argparse
import datetime
import gym
import numpy as np
import itertools
import torch
from sac import SAC
from torch.utils.tensorboard import SummaryWriter
from replay_memory import ReplayMemory
from load_demonstrations import load_demonstrations
import apple_gym.env
import pickle
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` is a trap with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so such a flag can never be
    switched off from the command line.

    Args:
        value: The raw argument (a string, or an actual bool).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays False, matching what bool("") used to yield.
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args')
parser.add_argument('--env-name', default="ApplePick-v0",
                    help='Mujoco Gym environment (default: ApplePick-v0)')
parser.add_argument('--policy', default="Gaussian",
                    help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
parser.add_argument('--eval', type=_str2bool, default=True,
                    help='Evaluates a policy a policy every 10 episode (default: True)')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
                    help='target smoothing coefficient(τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
                    help='learning rate (default: 0.0003)')
# Adjacent string literals instead of a backslash continuation, so the
# source indentation does not leak into the help text.
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
                    help='Temperature parameter α determines the relative importance of the entropy '
                         'term against the reward (default: 0.2)')
parser.add_argument('--automatic_entropy_tuning', type=_str2bool, default=False, metavar='G',
                    help='Automaically adjust α (default: False)')
parser.add_argument('--seed', type=int, default=123456, metavar='N',
                    help='random seed (default: 123456)')
parser.add_argument('--batch_size', type=int, default=256, metavar='N',
                    help='batch size (default: 256)')
parser.add_argument('--num_steps', type=int, default=1000001, metavar='N',
                    help='maximum number of steps (default: 1000001)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
                    help='hidden size (default: 256)')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
                    help='model updates per simulator step (default: 1)')
parser.add_argument('--start_steps', type=int, default=10000, metavar='N',
                    help='Steps sampling random actions (default: 10000)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
                    help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
parser.add_argument('--cuda', action="store_true",
                    help='run on CUDA (default: False)')
parser.add_argument('--demonstrations', default=False,
                    help='Load demonstrations from https://github.com/erfanMhi/gym-recording-modified')
args = parser.parse_args()

# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name, render=True)
env.seed(args.seed)
env.action_space.seed(args.seed)

# Seed torch and numpy with the same value as the environment.
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# TensorBoard
log_name = '{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
                                    args.policy, "autotune" if args.automatic_entropy_tuning else "")
writer = SummaryWriter('runs/' + log_name)

# Memory
memory = ReplayMemory(args.replay_size, args.seed)
if args.demonstrations:
    load_demonstrations(memory, args.demonstrations)

# NOTE(review): log_name embeds the *current* timestamp, so these paths cannot
# name files written by an earlier run — presumably the name of the run to
# restore should be supplied instead; confirm the intended checkpoint source.
agent.load_model("models/actor_" + log_name + '.pkl', "models/critic_" + log_name + '.pkl')
memory.load("models/memory_" + log_name + '.pkl')
+16
View File
@@ -1,5 +1,7 @@
import random
import numpy as np
import pickle
import os
class ReplayMemory:
def __init__(self, capacity, seed):
@@ -21,3 +23,17 @@ class ReplayMemory:
def __len__(self):
return len(self.buffer)
def save(self, env_name, suffix="", memory_path=None):
    """Pickle the replay buffer to a file.

    Args:
        env_name: Only used to build the default file name.
        suffix: Only used to build the default file name.
        memory_path: Destination file. When None, the buffer is written to
            ``models/memory_buffer_{env_name}_{suffix}``.
    """
    if memory_path is None:
        memory_path = "models/memory_buffer_{}_{}".format(env_name, suffix)

    # Create the directory that will actually hold the file (``models/`` for
    # the default path). exist_ok avoids the check-then-create race.
    target_dir = os.path.dirname(memory_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    print('Saving memory to {}'.format(memory_path))
    # Context manager so the handle is flushed and closed even if pickling fails.
    with open(memory_path, 'wb') as handle:
        pickle.dump(self.buffer, handle)
def load(self, memory_path):
    """Replace the buffer with the one pickled at ``memory_path``.

    Args:
        memory_path: File previously written with pickle. Passing None
            leaves the current buffer untouched.
    """
    if memory_path is None:
        # Nothing to load; skip the misleading "Loading memory from None" line.
        return

    print('Loading memory from {}'.format(memory_path))
    # SECURITY: unpickling can execute arbitrary code — only load files that
    # this project wrote itself.
    # NOTE(review): only ``self.buffer`` is restored; if the class tracks a
    # write index alongside the buffer it is not updated here — confirm.
    with open(memory_path, 'rb') as handle:
        self.buffer = pickle.load(handle)