This commit is contained in:
wassname
2021-01-09 20:22:39 +08:00
parent 4248a88ea4
commit a2c113d754
6 changed files with 90 additions and 158 deletions
+3 -2
View File
@@ -1,8 +1,9 @@
python=/home/wassname/anaconda/envs/diy-gym2/bin/python
date=2021-01-03_13-30-07
run:
${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2
${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load models/2021-01-05_07-41-16_SAC_ApplePick-v0_Gaussian_autotune
play:
${python} play.py --load-actor models/actor_${date}_SAC_ApplePick-v0_Gaussian_autotune.pkl --load-critic models/critic_${date}_SAC_ApplePick-v0_Gaussian_autotune.pkl --render
# ${python} play.py --load-actor models/actor_${date}_SAC_ApplePick-v0_Gaussian_autotune.pkl --load-critic models/critic_${date}_SAC_ApplePick-v0_Gaussian_autotune.pkl --render
${python} main.py --load auto --render --num_steps 0 --no-train
+5
View File
@@ -1,3 +1,8 @@
Modified for wassname's apple gym
make run
make play
### Description
------------
Reimplementation of [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) and a deterministic variant of SAC from [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement
+80 -61
View File
@@ -4,6 +4,7 @@ import gym
import numpy as np
import itertools
from pathlib import Path
import logging
import torch
from sac import SAC
from torch.utils.tensorboard import SummaryWriter
@@ -13,49 +14,56 @@ import apple_gym.env
import pickle
from tqdm.auto import tqdm
parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args')
parser.add_argument('-e', '--env-name', default="ApplePick-v0",
help='Mujoco Gym environment (default: ApplePick-v0)')
parser.add_argument('--policy', default="Gaussian",
help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
parser.add_argument('--eval', type=bool, default=True,
help='Evaluates a policy a policy every 10 episode (default: True)')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
help='discount factor for reward (default: 0.99)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
help='target smoothing coefficient(τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
help='learning rate (default: 0.0003)')
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
help='Temperature parameter α determines the relative importance of the entropy\
term against the reward (default: 0.2)')
parser.add_argument('--automatic_entropy_tuning', type=bool, default=True, metavar='G',
help='Automaically adjust α (default: True)')
parser.add_argument('--seed', type=int, default=123456, metavar='N',
help='random seed (default: 123456)')
parser.add_argument('--batch_size', type=int, default=256, metavar='N',
help='batch size (default: 256)')
parser.add_argument('--num_steps', type=int, default=1000001, metavar='N',
help='maximum number of steps (default: 1000000)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
help='hidden size (default: 256)')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
help='model updates per simulator step (default: 1)')
parser.add_argument('--start_steps', type=int, default=10000, metavar='N',
help='Steps sampling random actions (default: 10000)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
help='size of replay buffer (default: 10000000)')
parser.add_argument('--cuda', action="store_true",
help='run on CUDA (default: False)')
parser.add_argument('--demonstrations', default=False,
help='Load demonstrations from https://github.com/erfanMhi/gym-recording-modified')
parser.add_argument('-l', '--load', default=False,
help='Load models')
parser.add_argument('-r', '--render', action="store_true",
help='show')
args = parser.parse_args()
def get_args():
parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args')
parser.add_argument('-e', '--env-name', default="ApplePick-v0",
help='Mujoco Gym environment (default: ApplePick-v0)')
parser.add_argument('--policy', default="Gaussian",
help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
parser.add_argument('--eval', type=bool, default=True,
help='Evaluates a policy a policy every 10 episode (default: True)')
parser.add_argument('--no-train', dest='train', action='store_false')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
help='discount factor for reward (default: 0.99)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
help='target smoothing coefficient(τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
help='learning rate (default: 0.0003)')
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
help='Temperature parameter α determines the relative importance of the entropy\
term against the reward (default: 0.2)')
parser.add_argument('--automatic_entropy_tuning', type=bool, default=True, metavar='G',
help='Automaically adjust α (default: True)')
parser.add_argument('--seed', type=int, default=123456, metavar='N',
help='random seed (default: 123456)')
parser.add_argument('--batch_size', type=int, default=256, metavar='N',
help='batch size (default: 256)')
parser.add_argument('--num_steps', type=int, default=1000001, metavar='N',
help='maximum number of steps (default: 1000000)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
help='hidden size (default: 256)')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
help='model updates per simulator step (default: 1)')
parser.add_argument('--start_steps', type=int, default=10000, metavar='N',
help='Steps sampling random actions (default: 10000)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
help='size of replay buffer (default: 10000000)')
parser.add_argument('--cuda', action="store_true",
help='run on CUDA (default: False)')
parser.add_argument('--demonstrations', default=False,
help='Load demonstrations from https://github.com/erfanMhi/gym-recording-modified')
parser.add_argument('-l', '--load', default=False,
help='Load models')
parser.add_argument('-r', '--render', action="store_true",
help='show')
args = parser.parse_args()
return args
args = get_args()
print(args)
# Environment
# env = NormalizedActions(gym.make(args.env_name))
@@ -81,32 +89,42 @@ memory=ReplayMemory(args.replay_size, args.seed)
def save(save_dir):
save_dir.mkdir(exist_ok=True)
agent.save_model(save_dir/'actor.pkl', save_dir/'critic.pkl')
memory.save(save_dir/'memory.pkl')
try:
save_dir.mkdir(exist_ok=True)
print(f'Saving to {save_dir}')
agent.save_model(save_dir/'actor.pkl', save_dir/'critic.pkl')
# memory.save(save_dir / 'memory.pkl')
except Exception as e:
logging.exception("failed to save")
def load(save_dir):
agent.load_model(save_dir/'actor.pkl', save_dir/'critic.pkl')
memory.load(save_dir/'memory.pkl')
agent.load_model(save_dir / 'actor.pkl', save_dir / 'critic.pkl')
# if args.train:
# memory.load(save_dir/'memory.pkl')
if args.load:
load(args.load)
if args.load=='auto':
args.load = sorted(Path('models').glob('*/actor*'))[-1].parent
print(f'auto loading {args.load}')
load(Path(args.load))
print(f"memory {len(memory)} after load")
if args.demonstrations:
load_demonstrations(memory, args.demonstrations)
print(f"memory {len(memory)} after demonstrations")
# Training Loop
total_numsteps = 0
updates = 0
with tqdm(unit='steps', mininterval=5) as prog:
for i_episode in itertools.count(1):
for i_episode in itertools.count(0):
episode_reward = 0
episode_steps = 0
done = False
state = env.reset()
for i_step in itertools.count(1):
while (not done) and args.train:
if args.start_steps > total_numsteps:
action = env.action_space.sample() # Sample random action
else:
@@ -134,24 +152,21 @@ with tqdm(unit='steps', mininterval=5) as prog:
# log env stuff
for k in ['env_reward/apple_pick/tree/min_fruit_dist_reward',
'env_reward/apple_pick/tree/gripping_fruit_reward',
'env_reward/apple_pick/tree/force_tree_reward',
'env_reward/apple_pick/tree/force_fruit_reward']:
writer.add_scalar(k, info[k], episode_steps)
'env_reward/apple_pick/tree/gripping_fruit_reward',
'env_reward/apple_pick/tree/force_tree_reward',
'env_reward/apple_pick/tree/force_fruit_reward']:
writer.add_scalar(k, info[k], total_numsteps)
# Ignore the "done" signal if it comes from hitting the time horizon. (that is, when it's an artificial terminal signal that isn't based on the agent's state)
# (https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/sac/sac.py)
mask = 1 if episode_steps == env._max_episode_steps else float(not done)
memory.push(state, action, reward, next_state, mask) # Append transition to memory
memory.push(state, action, reward, next_state, mask) # Append transition to memory
state = next_state
if total_numsteps > args.num_steps:
break
writer.add_scalar('reward/train', episode_reward, i_episode)
print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))
print("\nEpisode: {}, total numsteps: {}, episode steps: {}, reward: {}, updates: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2), updates))
prog.desc = "e: {}, r: {}, u: {}, m: {}".format(i_episode, round(episode_reward, 2), updates, len(memory))
if i_episode % 10 == 0 and args.eval is True:
avg_reward = 0.
@@ -180,5 +195,9 @@ with tqdm(unit='steps', mininterval=5) as prog:
print("Test Episodes: {}, Avg. Reward: {}".format(episodes, round(avg_reward, 2)))
print("----------------------------------------")
if total_numsteps >= args.num_steps:
break
env.close()
save(save_dir)
-94
View File
@@ -1,94 +0,0 @@
import argparse
import datetime
import gym
import numpy as np
import itertools
import torch
from sac import SAC
from tqdm.auto import tqdm
import apple_gym.env
import pickle
parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args')
parser.add_argument('-e', '--env-name', default="ApplePick-v0",
help='Mujoco Gym environment (default: ApplePick-v0)')
parser.add_argument('--policy', default="Gaussian",
help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
parser.add_argument('--eval', type=bool, default=True,
help='Evaluates a policy a policy every 10 episode (default: True)')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
help='discount factor for reward (default: 0.99)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
help='target smoothing coefficient(τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
help='learning rate (default: 0.0003)')
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
help='Temperature parameter α determines the relative importance of the entropy\
term against the reward (default: 0.2)')
parser.add_argument('--automatic_entropy_tuning', type=bool, default=True, metavar='G',
help='Automaically adjust α (default: True)')
parser.add_argument('--seed', type=int, default=123456, metavar='N',
help='random seed (default: 123456)')
parser.add_argument('--batch_size', type=int, default=256, metavar='N',
help='batch size (default: 256)')
parser.add_argument('--num_steps', type=int, default=1000001, metavar='N',
help='maximum number of steps (default: 1000000)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
help='hidden size (default: 256)')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
help='model updates per simulator step (default: 1)')
parser.add_argument('--start_steps', type=int, default=10000, metavar='N',
help='Steps sampling random actions (default: 10000)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
help='size of replay buffer (default: 10000000)')
parser.add_argument('--cuda', action="store_true",
help='run on CUDA (default: False)')
parser.add_argument('--demonstrations', default=False,
help='Load demonstrations from https://github.com/erfanMhi/gym-recording-modified')
parser.add_argument('-l', '--load', default=False,
help='Load models')
parser.add_argument('-r', '--render', action="store_true",
help='show')
parser.add_argument('--load-actor', type=str, help='e.g. models/actor_2021-01-02_10-26-23_SAC_ApplePick-v0_Gaussian_autotune.pkl')
parser.add_argument('--load-critic', type=str, help='e.g. models/critic_2021-01-02_10-26-23_SAC_ApplePick-v0_Gaussian_autotune.pkl')
args = parser.parse_args()
torch.manual_seed(args.seed)
np.random.seed(args.seed)
# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name, render=args.render)
env.seed(args.seed)
env.action_space.seed(args.seed)
# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)
agent.load_model(args.load_actor, args.load_critic)
# Test
avg_reward = 0.
episodes = 10
for _ in tqdm(range(episodes)):
state = env.reset()
episode_reward = 0
done = False
while not done:
action = agent.select_action(state, evaluate=True)
next_state, reward, done, _ = env.step(action)
episode_reward += reward
state = next_state
avg_reward += episode_reward
avg_reward /= episodes
print("----------------------------------------")
print("Test Episodes: {}, Avg. Reward: {}".format(episodes, round(avg_reward, 2)))
print("----------------------------------------")
env.close()
+2
View File
@@ -32,4 +32,6 @@ class ReplayMemory:
def load(self, memory_path):
print('Loading memory from {}'.format(memory_path))
if memory_path is not None:
# print(self.buffer[0])
self.buffer = torch.load(memory_path)
self.position = len(self.buffer)
-1
View File
@@ -105,7 +105,6 @@ class SAC(object):
# Save model parameters
def save_model(self, actor_path=None, critic_path=None):
print('Saving models to {} and {}'.format(actor_path, critic_path))
torch.save(self.policy.state_dict(), actor_path)
torch.save(self.critic.state_dict(), critic_path)