mirror of
https://github.com/wassname/pytorch-soft-actor-critic.git
synced 2026-06-27 16:46:28 +08:00
progbar
This commit is contained in:
@@ -1,8 +1,12 @@
|
|||||||
python=/home/wassname/anaconda/envs/diygym3/bin/python
|
python=/home/wassname/anaconda/envs/diygym3/bin/python
|
||||||
date=2021-01-03_13-30-07
|
date=2021-01-03_13-30-07
|
||||||
LOGURU_LEVEL=INFO
|
LOGURU_LEVEL=INFO
|
||||||
|
# ulimit -S -m 35000000
|
||||||
|
# ulimit -S -v 35000000
|
||||||
|
|
||||||
run:
|
run:
|
||||||
LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 4 --automatic_entropy_tuning true
|
LOGURU_LEVEL=INFO ${python} -m pdb main.py --cuda --automatic_entropy_tuning true --replay_size 15000 --load auto
|
||||||
|
# LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --automatic_entropy_tuning true --replay_size 20000 --load auto
|
||||||
# LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load auto --alpha 0.1 --tau 1 --target_update_interval 1000
|
# LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load auto --alpha 0.1 --tau 1 --target_update_interval 1000
|
||||||
# LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load auto --tau 1 --target_update_interval 1000 --policy Deterministic
|
# LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load auto --tau 1 --target_update_interval 1000 --policy Deterministic
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,10 @@
|
|||||||
Modified for wassname's apple gym
|
Modified for wassname's apple gym
|
||||||
|
|
||||||
|
changes:
|
||||||
|
- save
|
||||||
|
- process_obs with grconvnet
|
||||||
|
- logging
|
||||||
|
|
||||||
make run
|
make run
|
||||||
make play
|
make play
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ from pathlib import Path
|
|||||||
import logging
|
import logging
|
||||||
import torch
|
import torch
|
||||||
from sac import SAC
|
from sac import SAC
|
||||||
from torch.utils.tensorboard import SummaryWriter
|
|
||||||
from replay_memory import ReplayMemory
|
from replay_memory import ReplayMemory
|
||||||
from load_demonstrations import load_demonstrations
|
from load_demonstrations import load_demonstrations
|
||||||
import apple_gym.env
|
import apple_gym.env
|
||||||
@@ -15,11 +14,25 @@ import pickle
|
|||||||
from process_obs import ProcessObservation
|
from process_obs import ProcessObservation
|
||||||
# from torchinfo import summary
|
# from torchinfo import summary
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from rich import print
|
from rich import print
|
||||||
from rich.logging import RichHandler
|
from rich.logging import RichHandler
|
||||||
|
from rich.progress import (
|
||||||
|
ProgressColumn,
|
||||||
|
BarColumn,
|
||||||
|
DownloadColumn,
|
||||||
|
TextColumn,
|
||||||
|
TransferSpeedColumn,
|
||||||
|
TimeRemainingColumn,
|
||||||
|
Progress,
|
||||||
|
TaskID,
|
||||||
|
TimeElapsedColumn,
|
||||||
|
SpinnerColumn,
|
||||||
|
Text
|
||||||
|
)
|
||||||
logging.basicConfig(level=logging.INFO, handlers=[RichHandler(rich_tracebacks=True, markup=True)])
|
logging.basicConfig(level=logging.INFO, handlers=[RichHandler(rich_tracebacks=True, markup=True)])
|
||||||
logger.configure(handlers=[{"sink": RichHandler(markup=True),
|
logger.configure(handlers=[{"sink": RichHandler(markup=True),
|
||||||
"format": "{message}"}])
|
"format": "{message}"}])
|
||||||
@@ -89,13 +102,15 @@ torch.manual_seed(args.seed)
|
|||||||
np.random.seed(args.seed)
|
np.random.seed(args.seed)
|
||||||
|
|
||||||
# A visual network
|
# A visual network
|
||||||
observation_space=env.observation_space.shape[0]
|
action_dim = env.action_space.shape[0]
|
||||||
|
observation_dim=env.observation_space.shape[0]
|
||||||
process_obs=ProcessObservation()
|
process_obs=ProcessObservation()
|
||||||
observation_space=observation_space - process_obs.reduce_action_space
|
observation_dim=observation_dim - process_obs.reduce_obs_space
|
||||||
logger.info(f"process_obs reduces obs_space {env.observation_space.shape[0]}-{process_obs.reduce_action_space}={observation_space}")
|
|
||||||
|
logger.info(f"process_obs reduces obs_space {env.observation_space.shape[0]}-{process_obs.reduce_obs_space}={observation_dim}")
|
||||||
|
|
||||||
# Agent
|
# Agent
|
||||||
agent = SAC(observation_space, env.action_space, args, process_obs)
|
agent = SAC(observation_dim, env.action_space, args, process_obs)
|
||||||
|
|
||||||
# TODO
|
# TODO
|
||||||
# summary(model, input_size=(batch_size, 1, 28, 28))
|
# summary(model, input_size=(batch_size, 1, 28, 28))
|
||||||
@@ -109,7 +124,7 @@ logger.info(f"log name {log_name}")
|
|||||||
save_dir=Path("models") / log_name
|
save_dir=Path("models") / log_name
|
||||||
|
|
||||||
# Memory
|
# Memory
|
||||||
memory=ReplayMemory(args.replay_size, args.seed)
|
memory=ReplayMemory(args.replay_size, args.seed, env.observation_space.shape[0], action_dim)
|
||||||
|
|
||||||
|
|
||||||
def save(save_dir):
|
def save(save_dir):
|
||||||
@@ -141,7 +156,32 @@ if args.demonstrations:
|
|||||||
total_numsteps = 0
|
total_numsteps = 0
|
||||||
updates = 0
|
updates = 0
|
||||||
|
|
||||||
with tqdm(unit='steps', mininterval=5) as prog:
|
class SpeedColumn(ProgressColumn):
    """Progress column that shows a task's speed as iterations per second."""

    def render(self, task: "Task") -> Text:
        """Return ``task.speed`` formatted as "<n> it/s", or "?" when it is None."""
        rate = task.speed
        # No speed value is available yet, so show a placeholder instead.
        label = "?" if rate is None else f"{rate:2.2f} it/s"
        return Text(label, style="progress.data.speed")
|
||||||
|
|
||||||
|
with Progress(
|
||||||
|
SpinnerColumn(),
|
||||||
|
"[progress.description]{task.description}",
|
||||||
|
BarColumn(),
|
||||||
|
TextColumn("{task.completed}/{task.total}"),
|
||||||
|
"[",
|
||||||
|
TimeElapsedColumn(),
|
||||||
|
"<",
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
',',
|
||||||
|
SpeedColumn(),
|
||||||
|
']',
|
||||||
|
refresh_per_second=1, speed_estimate_period=360
|
||||||
|
) as prog:
|
||||||
|
task1 = prog.add_task("[red]steps", total=args.num_steps)
|
||||||
|
task2 = prog.add_task("[red]updates", total=args.num_steps)
|
||||||
for i_episode in itertools.count(0):
|
for i_episode in itertools.count(0):
|
||||||
print('1')
|
print('1')
|
||||||
episode_reward = 0
|
episode_reward = 0
|
||||||
@@ -168,11 +208,12 @@ with tqdm(unit='steps', mininterval=5) as prog:
|
|||||||
writer.add_scalar('entropy_temperature/alpha', alpha, updates)
|
writer.add_scalar('entropy_temperature/alpha', alpha, updates)
|
||||||
|
|
||||||
updates += 1
|
updates += 1
|
||||||
|
prog.update(task2, advance=1)
|
||||||
|
|
||||||
next_state, reward, done, info = env.step(action) # Step
|
next_state, reward, done, info = env.step(action) # Step
|
||||||
episode_steps += 1
|
episode_steps += 1
|
||||||
total_numsteps += 1
|
total_numsteps += 1
|
||||||
prog.update(1)
|
prog.update(task1, advance=1)
|
||||||
episode_reward += reward
|
episode_reward += reward
|
||||||
|
|
||||||
# log env stuff
|
# log env stuff
|
||||||
|
|||||||
+20
-20
@@ -46,26 +46,26 @@ class GenerativeResnet3Headless(nn.Module):
|
|||||||
self.res4 = ResidualBlock(channel_size * 4, channel_size * 4)
|
self.res4 = ResidualBlock(channel_size * 4, channel_size * 4)
|
||||||
|
|
||||||
|
|
||||||
self.conv4 = nn.ConvTranspose2d(channel_size * 4, channel_size * 2, kernel_size=4, stride=2, padding=1,
|
# self.conv4 = nn.ConvTranspose2d(channel_size * 4, channel_size * 2, kernel_size=4, stride=2, padding=1,
|
||||||
output_padding=1)
|
# output_padding=1)
|
||||||
self.bn4 = nn.BatchNorm2d(channel_size * 2)
|
# self.bn4 = nn.BatchNorm2d(channel_size * 2)
|
||||||
|
|
||||||
self.conv5 = nn.ConvTranspose2d(channel_size * 2, channel_size, kernel_size=4, stride=2, padding=2,
|
# self.conv5 = nn.ConvTranspose2d(channel_size * 2, channel_size, kernel_size=4, stride=2, padding=2,
|
||||||
output_padding=1)
|
# output_padding=1)
|
||||||
self.bn5 = nn.BatchNorm2d(channel_size)
|
# self.bn5 = nn.BatchNorm2d(channel_size)
|
||||||
|
|
||||||
self.conv6 = nn.ConvTranspose2d(channel_size, channel_size, kernel_size=9, stride=1, padding=4)
|
# self.conv6 = nn.ConvTranspose2d(channel_size, channel_size, kernel_size=9, stride=1, padding=4)
|
||||||
|
|
||||||
self.pos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
|
# self.pos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
|
||||||
self.cos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
|
# self.cos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
|
||||||
self.sin_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
|
# self.sin_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
|
||||||
self.width_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
|
# self.width_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
|
||||||
|
|
||||||
self.dropout = dropout
|
# self.dropout = dropout
|
||||||
self.dropout_pos = nn.Dropout(p=prob)
|
# self.dropout_pos = nn.Dropout(p=prob)
|
||||||
self.dropout_cos = nn.Dropout(p=prob)
|
# self.dropout_cos = nn.Dropout(p=prob)
|
||||||
self.dropout_sin = nn.Dropout(p=prob)
|
# self.dropout_sin = nn.Dropout(p=prob)
|
||||||
self.dropout_wid = nn.Dropout(p=prob)
|
# self.dropout_wid = nn.Dropout(p=prob)
|
||||||
|
|
||||||
# freeze above params
|
# freeze above params
|
||||||
for param in self.parameters():
|
for param in self.parameters():
|
||||||
@@ -122,12 +122,12 @@ class ProcessObservation(nn.Module):
|
|||||||
os.path.dirname(os.path.abspath(__file__)),
|
os.path.dirname(os.path.abspath(__file__)),
|
||||||
'data/nets/cornell-randsplit-rgbd-grconvnet3-drop1-ch16/epoch_30_iou_0.97.pt'
|
'data/nets/cornell-randsplit-rgbd-grconvnet3-drop1-ch16/epoch_30_iou_0.97.pt'
|
||||||
)
|
)
|
||||||
self.feature_extractor = GenerativeResnet3Headless().eval()
|
self.feature_extractor = GenerativeResnet3Headless()#.half()
|
||||||
self.feature_extractor.load_state_dict(state_dict=torch.load(grconvnet3_path))
|
self.feature_extractor.load_state_dict(state_dict=torch.load(grconvnet3_path), strict=False)
|
||||||
|
|
||||||
old_img_size = (res[0], res[1], 8)
|
old_img_size = (res[0], res[1], 8)
|
||||||
new_img_size = (res[0]//16-1, res[1]//16-1, 8)
|
new_img_size = (res[0]//16-1, res[1]//16-1, 8)
|
||||||
self.reduce_action_space = int(np.prod(old_img_size) - np.prod(new_img_size))
|
self.reduce_obs_space = int(np.prod(old_img_size) - np.prod(new_img_size))
|
||||||
|
|
||||||
def __call__(self, obs):
|
def __call__(self, obs):
|
||||||
"""
|
"""
|
||||||
@@ -135,7 +135,7 @@ class ProcessObservation(nn.Module):
|
|||||||
|
|
||||||
This assumes the observation ends in 2 rgbd images with shape (224, 244, 4)
|
This assumes the observation ends in 2 rgbd images with shape (224, 244, 4)
|
||||||
"""
|
"""
|
||||||
# import pdb; pdb.set_trace()
|
assert obs.shape[-1] > self.res[0] * self.res[1] * 8
|
||||||
h, w = self.res
|
h, w = self.res
|
||||||
px = h * w
|
px = h * w
|
||||||
base_rgbd = obs[:, -px * 4:].reshape((-1, h, w, 4))
|
base_rgbd = obs[:, -px * 4:].reshape((-1, h, w, 4))
|
||||||
|
|||||||
+95
-2
@@ -5,7 +5,23 @@ import hickle
|
|||||||
import os
|
import os
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
class ReplayMemory:
|
import lz4.frame
|
||||||
|
import cloudpickle as pickle
|
||||||
|
|
||||||
|
def pack(data):
    """Pickle ``data`` and LZ4-frame-compress the resulting bytes.

    Uses the module-level ``pickle`` name, which this file binds to
    cloudpickle. Returns the compressed payload.
    """
    serialized = pickle.dumps(data)
    return lz4.frame.compress(serialized)
|
||||||
|
|
||||||
|
def unpack(data):
    """LZ4-frame-decompress ``data`` and unpickle the result.

    Reverses the two steps performed by ``pack``.
    """
    # NOTE: unpickling can execute arbitrary code; only pass payloads that
    # this program produced itself.
    serialized = lz4.frame.decompress(data)
    return pickle.loads(serialized)
|
||||||
|
|
||||||
|
|
||||||
|
class ReplayMemory2:
|
||||||
def __init__(self, capacity, seed):
|
def __init__(self, capacity, seed):
|
||||||
random.seed(seed)
|
random.seed(seed)
|
||||||
self.capacity = capacity
|
self.capacity = capacity
|
||||||
@@ -15,11 +31,14 @@ class ReplayMemory:
|
|||||||
def push(self, state, action, reward, next_state, done):
|
def push(self, state, action, reward, next_state, done):
|
||||||
if len(self.buffer) < self.capacity:
|
if len(self.buffer) < self.capacity:
|
||||||
self.buffer.append(None)
|
self.buffer.append(None)
|
||||||
self.buffer[self.position] = (state, action, reward, next_state, done)
|
batch = (state, action, reward, next_state, done)
|
||||||
|
# batch = pack(batch) # slow it down 10x
|
||||||
|
self.buffer[self.position] = batch
|
||||||
self.position = (self.position + 1) % self.capacity
|
self.position = (self.position + 1) % self.capacity
|
||||||
|
|
||||||
def sample(self, batch_size):
|
def sample(self, batch_size):
|
||||||
batch = random.sample(self.buffer, batch_size)
|
batch = random.sample(self.buffer, batch_size)
|
||||||
|
# batch = [unpack(d) for d in batch]
|
||||||
state, action, reward, next_state, done = map(np.stack, zip(*batch))
|
state, action, reward, next_state, done = map(np.stack, zip(*batch))
|
||||||
return state, action, reward, next_state, done
|
return state, action, reward, next_state, done
|
||||||
|
|
||||||
@@ -35,3 +54,77 @@ class ReplayMemory:
|
|||||||
if memory_path is not None:
|
if memory_path is not None:
|
||||||
self.buffer = hickle.load(memory_path)
|
self.buffer = hickle.load(memory_path)
|
||||||
self.position = len(self.buffer)
|
self.position = len(self.buffer)
|
||||||
|
|
||||||
|
|
||||||
|
class ReplayMemory:
    """Fixed-capacity ring buffer of transitions stored in preallocated arrays.

    Observations and next observations are kept as float16 and terminal
    flags as uint8; actions and rewards use NumPy's default float dtype.
    Once ``capacity`` transitions have been pushed, new ones overwrite the
    oldest slots.
    """

    def __init__(self, capacity, seed, observation_dim, action_dim):
        """Allocate storage for ``capacity`` transitions.

        Args:
            capacity: maximum number of transitions held at once.
            seed: seed passed to the stdlib ``random`` module. Sampling
                itself uses ``np.random``, which is not seeded here.
            observation_dim: length of one flattened observation.
            action_dim: length of one action vector.
        """
        random.seed(seed)
        self.capacity = capacity
        self._observations = np.zeros((capacity, observation_dim), dtype='float16')
        self._actions = np.zeros((capacity, action_dim))
        self._rewards = np.zeros((capacity, 1))
        self._next_obs = np.zeros((capacity, observation_dim), dtype='float16')
        self._terminals = np.zeros((capacity, 1), dtype='uint8')
        self.position = 0  # next slot to write
        self._size = 0  # number of valid slots, at most ``capacity``

    def push(self, state, action, reward, next_state, done):
        """Write one transition at the current position, then advance it."""
        self._observations[self.position] = state
        self._actions[self.position] = action
        self._rewards[self.position] = reward
        self._next_obs[self.position] = next_state
        self._terminals[self.position] = done
        # Wrap around so the oldest entries are overwritten once full.
        self.position = (self.position + 1) % self.capacity
        if self._size < self.capacity:
            self._size += 1

    def sample(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly with replacement.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` of arrays
            whose first dimension is ``batch_size``.

        Raises:
            ValueError: from ``np.random.choice`` when the buffer is empty
                and ``batch_size`` is non-zero.
        """
        # Draw from the number of filled slots, not from ``position``:
        # ``position`` wraps back to 0 when the buffer fills, which would
        # hide most of the stored transitions (or leave nothing to sample).
        indices = np.random.choice(self._size, size=batch_size)
        state = self._observations[indices]
        action = self._actions[indices]
        reward = self._rewards[indices]
        next_state = self._next_obs[indices]
        done = self._terminals[indices]
        return state, action, reward, next_state, done

    def __len__(self):
        """Return the number of transitions currently stored."""
        return self._size
|
||||||
|
|
||||||
|
|
||||||
|
# class BatchedReplayMemory:
|
||||||
|
# def __init__(self, capacity, seed, action_dim, observation_dim):
|
||||||
|
# random.seed(seed)
|
||||||
|
# self.capacity = capacity
|
||||||
|
# self._observations = np.zeros((capacity, observation_dim))
|
||||||
|
# self._actions = np.zeros((capacity, action_dim), dtype='float16')
|
||||||
|
# self._rewards = np.zeros((capacity, 1))
|
||||||
|
# self._next_obs = np.zeros((capacity, observation_dim), dtype='float16')
|
||||||
|
# self._terminals = np.zeros((capacity, 1), dtype='uint8')
|
||||||
|
# self.position = 0
|
||||||
|
# raise NotImplementedError()
|
||||||
|
|
||||||
|
# def push(self, state, action, reward, next_state, done):
|
||||||
|
# self._observations[self.position] = state
|
||||||
|
# self._actions[self.position] = action
|
||||||
|
# self._rewards[self.position] = reward
|
||||||
|
# self._next_obs[self.position] = next_state
|
||||||
|
# self._terminals[self.position] = done
|
||||||
|
# if self.position > self.capacity:
|
||||||
|
# # write to a dask capable file
|
||||||
|
# self.position = (self.position + 1) % self.capacity
|
||||||
|
# raise NotImplementedError()
|
||||||
|
|
||||||
|
# def sample(self, batch_size):
|
||||||
|
# # first choose a historic dask file, and this one
|
||||||
|
# # sample from both
|
||||||
|
# indices = np.random.choice(self._size, size=batch_size)
|
||||||
|
# state = self._observations[indices]
|
||||||
|
# action = self._actions[indices]
|
||||||
|
# reward = self._rewards[indices]
|
||||||
|
# next_state = self._next_obs[indices]
|
||||||
|
# done = self._terminals[indices]
|
||||||
|
# return state, action, reward, next_state, done
|
||||||
|
|
||||||
|
# def __len__(self):
|
||||||
|
# return len(self._observations)
|
||||||
|
|||||||
@@ -44,16 +44,20 @@ class SAC(object):
|
|||||||
self.alpha = 0
|
self.alpha = 0
|
||||||
self.automatic_entropy_tuning = False
|
self.automatic_entropy_tuning = False
|
||||||
self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
|
self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
|
||||||
self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
|
self.policy_optim = Adam(
|
||||||
|
list(self.policy.parameters()) + list(process_obs.parameters()),
|
||||||
|
lr=args.lr)
|
||||||
|
|
||||||
def select_action(self, obs, evaluate=False):
|
def select_action(self, obs, evaluate=False):
|
||||||
obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0)
|
with torch.no_grad():
|
||||||
state = self.process_obs(obs)
|
obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0)
|
||||||
if evaluate is False:
|
state = self.process_obs(obs)
|
||||||
action, _, _ = self.policy.sample(state)
|
if evaluate is False:
|
||||||
else:
|
action, _, _ = self.policy.sample(state)
|
||||||
_, _, action = self.policy.sample(state)
|
else:
|
||||||
return action.detach().cpu().numpy()[0]
|
_, _, action = self.policy.sample(state)
|
||||||
|
action = action.detach().cpu().numpy()[0]
|
||||||
|
return action
|
||||||
|
|
||||||
def update_parameters(self, memory, batch_size, updates):
|
def update_parameters(self, memory, batch_size, updates):
|
||||||
# Sample a batch from memory
|
# Sample a batch from memory
|
||||||
|
|||||||
Reference in New Issue
Block a user