This commit is contained in:
wassname
2021-01-17 11:29:35 +08:00
parent 5534d4b078
commit e4fd67f3b5
6 changed files with 186 additions and 39 deletions
+5 -1
View File
@@ -1,8 +1,12 @@
python=/home/wassname/anaconda/envs/diygym3/bin/python python=/home/wassname/anaconda/envs/diygym3/bin/python
date=2021-01-03_13-30-07 date=2021-01-03_13-30-07
LOGURU_LEVEL=INFO LOGURU_LEVEL=INFO
# ulimit -S -m 35000000
# ulimit -S -v 35000000
run: run:
LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 4 --automatic_entropy_tuning true LOGURU_LEVEL=INFO ${python} -m pdb main.py --cuda --automatic_entropy_tuning true --replay_size 15000 --load auto
# LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --automatic_entropy_tuning true --replay_size 20000 --load auto
# LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load auto --alpha 0.1 --tau 1 --target_update_interval 1000 # LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load auto --alpha 0.1 --tau 1 --target_update_interval 1000
# LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load auto --tau 1 --target_update_interval 1000 --policy Deterministic # LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load auto --tau 1 --target_update_interval 1000 --policy Deterministic
+5
View File
@@ -1,5 +1,10 @@
Modified for wassname's apple gym Modified for wassname's apple gym
changes:
- save
- process_obs with grconvnet
- logging
make run make run
make play make play
+49 -8
View File
@@ -7,7 +7,6 @@ from pathlib import Path
import logging import logging
import torch import torch
from sac import SAC from sac import SAC
from torch.utils.tensorboard import SummaryWriter
from replay_memory import ReplayMemory from replay_memory import ReplayMemory
from load_demonstrations import load_demonstrations from load_demonstrations import load_demonstrations
import apple_gym.env import apple_gym.env
@@ -15,11 +14,25 @@ import pickle
from process_obs import ProcessObservation from process_obs import ProcessObservation
# from torchinfo import summary # from torchinfo import summary
from tqdm.auto import tqdm from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from loguru import logger from loguru import logger
from rich import print from rich import print
from rich.logging import RichHandler from rich.logging import RichHandler
from rich.progress import (
ProgressColumn,
BarColumn,
DownloadColumn,
TextColumn,
TransferSpeedColumn,
TimeRemainingColumn,
Progress,
TaskID,
TimeElapsedColumn,
SpinnerColumn,
Text
)
logging.basicConfig(level=logging.INFO, handlers=[RichHandler(rich_tracebacks=True, markup=True)]) logging.basicConfig(level=logging.INFO, handlers=[RichHandler(rich_tracebacks=True, markup=True)])
logger.configure(handlers=[{"sink": RichHandler(markup=True), logger.configure(handlers=[{"sink": RichHandler(markup=True),
"format": "{message}"}]) "format": "{message}"}])
@@ -89,13 +102,15 @@ torch.manual_seed(args.seed)
np.random.seed(args.seed) np.random.seed(args.seed)
# A visual network # A visual network
observation_space=env.observation_space.shape[0] action_dim = env.action_space.shape[0]
observation_dim=env.observation_space.shape[0]
process_obs=ProcessObservation() process_obs=ProcessObservation()
observation_space=observation_space - process_obs.reduce_action_space observation_dim=observation_dim - process_obs.reduce_obs_space
logger.info(f"process_obs reduces obs_space {env.observation_space.shape[0]}-{process_obs.reduce_action_space}={observation_space}")
logger.info(f"process_obs reduces obs_space {env.observation_space.shape[0]}-{process_obs.reduce_obs_space}={observation_dim}")
# Agent # Agent
agent = SAC(observation_space, env.action_space, args, process_obs) agent = SAC(observation_dim, env.action_space, args, process_obs)
# TODO # TODO
# summary(model, input_size=(batch_size, 1, 28, 28)) # summary(model, input_size=(batch_size, 1, 28, 28))
@@ -109,7 +124,7 @@ logger.info(f"log name {log_name}")
save_dir=Path("models") / log_name save_dir=Path("models") / log_name
# Memory # Memory
memory=ReplayMemory(args.replay_size, args.seed) memory=ReplayMemory(args.replay_size, args.seed, env.observation_space.shape[0], action_dim)
def save(save_dir): def save(save_dir):
@@ -141,7 +156,32 @@ if args.demonstrations:
total_numsteps = 0 total_numsteps = 0
updates = 0 updates = 0
with tqdm(unit='steps', mininterval=5) as prog: class SpeedColumn(ProgressColumn):
"""Renders human readable transfer speed."""
def render(self, task: "Task") -> Text:
"""Show data transfer speed."""
speed = task.speed
if speed is None:
return Text("?", style="progress.data.speed")
return Text(f"{speed:2.2f} it/s", style="progress.data.speed")
with Progress(
SpinnerColumn(),
"[progress.description]{task.description}",
BarColumn(),
TextColumn("{task.completed}/{task.total}"),
"[",
TimeElapsedColumn(),
"<",
TimeRemainingColumn(),
',',
SpeedColumn(),
']',
refresh_per_second=1, speed_estimate_period=360
) as prog:
task1 = prog.add_task("[red]steps", total=args.num_steps)
task2 = prog.add_task("[red]updates", total=args.num_steps)
for i_episode in itertools.count(0): for i_episode in itertools.count(0):
print('1') print('1')
episode_reward = 0 episode_reward = 0
@@ -168,11 +208,12 @@ with tqdm(unit='steps', mininterval=5) as prog:
writer.add_scalar('entropy_temperature/alpha', alpha, updates) writer.add_scalar('entropy_temperature/alpha', alpha, updates)
updates += 1 updates += 1
prog.update(task2, advance=1)
next_state, reward, done, info = env.step(action) # Step next_state, reward, done, info = env.step(action) # Step
episode_steps += 1 episode_steps += 1
total_numsteps += 1 total_numsteps += 1
prog.update(1) prog.update(task1, advance=1)
episode_reward += reward episode_reward += reward
# log env stuff # log env stuff
+20 -20
View File
@@ -46,26 +46,26 @@ class GenerativeResnet3Headless(nn.Module):
self.res4 = ResidualBlock(channel_size * 4, channel_size * 4) self.res4 = ResidualBlock(channel_size * 4, channel_size * 4)
self.conv4 = nn.ConvTranspose2d(channel_size * 4, channel_size * 2, kernel_size=4, stride=2, padding=1, # self.conv4 = nn.ConvTranspose2d(channel_size * 4, channel_size * 2, kernel_size=4, stride=2, padding=1,
output_padding=1) # output_padding=1)
self.bn4 = nn.BatchNorm2d(channel_size * 2) # self.bn4 = nn.BatchNorm2d(channel_size * 2)
self.conv5 = nn.ConvTranspose2d(channel_size * 2, channel_size, kernel_size=4, stride=2, padding=2, # self.conv5 = nn.ConvTranspose2d(channel_size * 2, channel_size, kernel_size=4, stride=2, padding=2,
output_padding=1) # output_padding=1)
self.bn5 = nn.BatchNorm2d(channel_size) # self.bn5 = nn.BatchNorm2d(channel_size)
self.conv6 = nn.ConvTranspose2d(channel_size, channel_size, kernel_size=9, stride=1, padding=4) # self.conv6 = nn.ConvTranspose2d(channel_size, channel_size, kernel_size=9, stride=1, padding=4)
self.pos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) # self.pos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
self.cos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) # self.cos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
self.sin_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) # self.sin_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
self.width_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) # self.width_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2)
self.dropout = dropout # self.dropout = dropout
self.dropout_pos = nn.Dropout(p=prob) # self.dropout_pos = nn.Dropout(p=prob)
self.dropout_cos = nn.Dropout(p=prob) # self.dropout_cos = nn.Dropout(p=prob)
self.dropout_sin = nn.Dropout(p=prob) # self.dropout_sin = nn.Dropout(p=prob)
self.dropout_wid = nn.Dropout(p=prob) # self.dropout_wid = nn.Dropout(p=prob)
# freeze above params # freeze above params
for param in self.parameters(): for param in self.parameters():
@@ -122,12 +122,12 @@ class ProcessObservation(nn.Module):
os.path.dirname(os.path.abspath(__file__)), os.path.dirname(os.path.abspath(__file__)),
'data/nets/cornell-randsplit-rgbd-grconvnet3-drop1-ch16/epoch_30_iou_0.97.pt' 'data/nets/cornell-randsplit-rgbd-grconvnet3-drop1-ch16/epoch_30_iou_0.97.pt'
) )
self.feature_extractor = GenerativeResnet3Headless().eval() self.feature_extractor = GenerativeResnet3Headless()#.half()
self.feature_extractor.load_state_dict(state_dict=torch.load(grconvnet3_path)) self.feature_extractor.load_state_dict(state_dict=torch.load(grconvnet3_path), strict=False)
old_img_size = (res[0], res[1], 8) old_img_size = (res[0], res[1], 8)
new_img_size = (res[0]//16-1, res[1]//16-1, 8) new_img_size = (res[0]//16-1, res[1]//16-1, 8)
self.reduce_action_space = int(np.prod(old_img_size) - np.prod(new_img_size)) self.reduce_obs_space = int(np.prod(old_img_size) - np.prod(new_img_size))
def __call__(self, obs): def __call__(self, obs):
""" """
@@ -135,7 +135,7 @@ class ProcessObservation(nn.Module):
This assumes the observations ends in 2 rgbd images with shape (224, 244, 4) This assumes the observations ends in 2 rgbd images with shape (224, 244, 4)
""" """
# import pdb; pdb.set_trace() assert obs.shape[-1] > self.res[0] * self.res[1] * 8
h, w = self.res h, w = self.res
px = h * w px = h * w
base_rgbd = obs[:, -px * 4:].reshape((-1, h, w, 4)) base_rgbd = obs[:, -px * 4:].reshape((-1, h, w, 4))
+95 -2
View File
@@ -5,7 +5,23 @@ import hickle
import os import os
from loguru import logger from loguru import logger
class ReplayMemory: import lz4.frame
import cloudpickle as pickle
def pack(data):
    """Serialize ``data`` with pickle and LZ4-frame-compress the result.

    Returns the compressed bytes; ``unpack`` reverses the operation.
    """
    serialized = pickle.dumps(data)
    return lz4.frame.compress(serialized)
def unpack(data):
    """Decompress LZ4-frame bytes and unpickle them back into an object.

    Inverse of ``pack``. Unpickling executes arbitrary code, so only feed
    this function bytes that were produced by ``pack``.
    """
    serialized = lz4.frame.decompress(data)
    return pickle.loads(serialized)
class ReplayMemory2:
def __init__(self, capacity, seed): def __init__(self, capacity, seed):
random.seed(seed) random.seed(seed)
self.capacity = capacity self.capacity = capacity
@@ -15,11 +31,14 @@ class ReplayMemory:
def push(self, state, action, reward, next_state, done): def push(self, state, action, reward, next_state, done):
if len(self.buffer) < self.capacity: if len(self.buffer) < self.capacity:
self.buffer.append(None) self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done) batch = (state, action, reward, next_state, done)
# batch = pack(batch) # slow it down 10x
self.buffer[self.position] = batch
self.position = (self.position + 1) % self.capacity self.position = (self.position + 1) % self.capacity
def sample(self, batch_size): def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) batch = random.sample(self.buffer, batch_size)
# batch = [unpack(d) for d in batch]
state, action, reward, next_state, done = map(np.stack, zip(*batch)) state, action, reward, next_state, done = map(np.stack, zip(*batch))
return state, action, reward, next_state, done return state, action, reward, next_state, done
@@ -35,3 +54,77 @@ class ReplayMemory:
if memory_path is not None: if memory_path is not None:
self.buffer = hickle.load(memory_path) self.buffer = hickle.load(memory_path)
self.position = len(self.buffer) self.position = len(self.buffer)
class ReplayMemory:
    """Fixed-capacity ring buffer of transitions held in preallocated arrays.

    Observations and next observations are stored as float16 to save
    memory; actions and rewards use NumPy's default float dtype and
    terminal flags are uint8. Once ``capacity`` transitions have been
    pushed, each new transition overwrites the oldest slot.
    """

    def __init__(self, capacity, seed, observation_dim, action_dim):
        """Allocate storage for ``capacity`` transitions.

        Args:
            capacity: maximum number of transitions kept.
            seed: seed passed to the stdlib ``random`` module.
            observation_dim: length of a flattened observation vector.
            action_dim: length of an action vector.
        """
        # NOTE(review): this seeds the stdlib ``random`` module only, while
        # ``sample`` draws from NumPy's global RNG; reproducible sampling
        # relies on the caller seeding ``np.random`` itself.
        random.seed(seed)
        self.capacity = capacity
        self._observations = np.zeros((capacity, observation_dim), dtype='float16')
        self._actions = np.zeros((capacity, action_dim))
        self._rewards = np.zeros((capacity, 1))
        self._next_obs = np.zeros((capacity, observation_dim), dtype='float16')
        self._terminals = np.zeros((capacity, 1), dtype='uint8')
        self.position = 0  # slot the next push writes to
        self._size = 0  # number of valid rows, saturates at capacity

    def push(self, state, action, reward, next_state, done):
        """Store one transition at the write cursor and advance the cursor."""
        slot = self.position
        self._observations[slot] = state
        self._actions[slot] = action
        self._rewards[slot] = reward
        self._next_obs[slot] = next_state
        self._terminals[slot] = done
        self.position = (slot + 1) % self.capacity
        if self._size < self.capacity:
            self._size += 1

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` of arrays
            whose first dimension is ``batch_size``.

        Raises:
            ValueError: raised by NumPy when the memory is empty and
                ``batch_size`` is non-zero.
        """
        # Bound by the number of valid rows, not by the write cursor: the
        # cursor wraps back to 0 once the buffer is full, which would
        # otherwise hide most (or all) of the stored transitions.
        indices = np.random.choice(self._size, size=batch_size)
        return (
            self._observations[indices],
            self._actions[indices],
            self._rewards[indices],
            self._next_obs[indices],
            self._terminals[indices],
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return self._size
# class BatchedReplayMemory:
# def __init__(self, capacity, seed, action_dim, observation_dim):
# random.seed(seed)
# self.capacity = capacity
# self._observations = np.zeros((capacity, observation_dim))
# self._actions = np.zeros((capacity, action_dim), dtype='float16')
# self._rewards = np.zeros((capacity, 1))
# self._next_obs = np.zeros((capacity, observation_dim), dtype='float16')
# self._terminals = np.zeros((capacity, 1), dtype='uint8')
# self.position = 0
# raise NotImplementedError()
# def push(self, state, action, reward, next_state, done):
# self._observations[self.position] = state
# self._actions[self.position] = action
# self._rewards[self.position] = reward
# self._next_obs[self.position] = next_state
# self._terminals[self.position] = done
# if self.position > self.capacity:
# # write to a dask capable file
# self.position = (self.position + 1) % self.capacity
# raise NotImplementedError()
# def sample(self, batch_size):
# # first choose a historic dask file, and this one
# # sample from both
# indices = np.random.choice(self._size, size=batch_size)
# state = self._observations[indices]
# action = self._actions[indices]
# reward = self._rewards[indices]
# next_state = self._next_obs[indices]
# done = self._terminals[indices]
# return state, action, reward, next_state, done
# def __len__(self):
# return len(self._observations)
+12 -8
View File
@@ -44,16 +44,20 @@ class SAC(object):
self.alpha = 0 self.alpha = 0
self.automatic_entropy_tuning = False self.automatic_entropy_tuning = False
self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device) self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) self.policy_optim = Adam(
list(self.policy.parameters()) + list(process_obs.parameters()),
lr=args.lr)
def select_action(self, obs, evaluate=False): def select_action(self, obs, evaluate=False):
obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0) with torch.no_grad():
state = self.process_obs(obs) obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0)
if evaluate is False: state = self.process_obs(obs)
action, _, _ = self.policy.sample(state) if evaluate is False:
else: action, _, _ = self.policy.sample(state)
_, _, action = self.policy.sample(state) else:
return action.detach().cpu().numpy()[0] _, _, action = self.policy.sample(state)
action = action.detach().cpu().numpy()[0]
return action
def update_parameters(self, memory, batch_size, updates): def update_parameters(self, memory, batch_size, updates):
# Sample a batch from memory # Sample a batch from memory