diff --git a/Makefile b/Makefile index 76c1ecf..44d303d 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,12 @@ python=/home/wassname/anaconda/envs/diygym3/bin/python date=2021-01-03_13-30-07 LOGURU_LEVEL=INFO +# ulimit -S -m 35000000 +# ulimit -S -v 35000000 + run: - LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 4 --automatic_entropy_tuning true + LOGURU_LEVEL=INFO ${python} -m pdb main.py --cuda --automatic_entropy_tuning true --replay_size 15000 --load auto + # LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --automatic_entropy_tuning true --replay_size 20000 --load auto # LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load auto --alpha 0.1 --tau 1 --target_update_interval 1000 # LOGURU_LEVEL=INFO ${python} main.py --demonstrations data/demonstrations --cuda --updates_per_step 2 --load auto --tau 1 --target_update_interval 1000 --policy Deterministic diff --git a/README.md b/README.md index c3d5587..03024d8 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ Modified for wassname's apple gym +changes: +- save +- process_obs with grconvnet +- logging + make run make play diff --git a/main.py b/main.py index 440c003..d3dc7aa 100644 --- a/main.py +++ b/main.py @@ -7,7 +7,6 @@ from pathlib import Path import logging import torch from sac import SAC -from torch.utils.tensorboard import SummaryWriter from replay_memory import ReplayMemory from load_demonstrations import load_demonstrations import apple_gym.env @@ -15,11 +14,25 @@ import pickle from process_obs import ProcessObservation # from torchinfo import summary from tqdm.auto import tqdm +from torch.utils.tensorboard import SummaryWriter from loguru import logger from rich import print from rich.logging import RichHandler +from rich.progress import ( + ProgressColumn, + BarColumn, + DownloadColumn, + TextColumn, + TransferSpeedColumn, + TimeRemainingColumn, + Progress, + TaskID, + TimeElapsedColumn, + SpinnerColumn, + Text +) logging.basicConfig(level=logging.INFO, handlers=[RichHandler(rich_tracebacks=True, markup=True)]) logger.configure(handlers=[{"sink": RichHandler(markup=True), "format": "{message}"}]) @@ -89,13 +102,15 @@ torch.manual_seed(args.seed) np.random.seed(args.seed) # A visual network -observation_space=env.observation_space.shape[0] +action_dim = env.action_space.shape[0] +observation_dim=env.observation_space.shape[0] process_obs=ProcessObservation() -observation_space=observation_space - process_obs.reduce_action_space -logger.info(f"process_obs reduces obs_space {env.observation_space.shape[0]}-{process_obs.reduce_action_space}={observation_space}") +observation_dim=observation_dim - process_obs.reduce_obs_space + +logger.info(f"process_obs reduces obs_space {env.observation_space.shape[0]}-{process_obs.reduce_obs_space}={observation_dim}") # Agent -agent = SAC(observation_space, env.action_space, args, process_obs) +agent = SAC(observation_dim, env.action_space, args, process_obs) # TODO # summary(model, input_size=(batch_size, 1, 28, 28)) @@ -109,7 +124,7 @@ logger.info(f"log name {log_name}") save_dir=Path("models") / log_name # Memory -memory=ReplayMemory(args.replay_size, args.seed) +memory=ReplayMemory(args.replay_size, args.seed, env.observation_space.shape[0], action_dim) def save(save_dir): @@ -141,7 +156,32 @@ if args.demonstrations: total_numsteps = 0 updates = 0 -with tqdm(unit='steps', mininterval=5) as prog: +class SpeedColumn(ProgressColumn): + """Renders human readable transfer speed.""" + + def render(self, task: "Task") -> Text: + """Show data transfer speed.""" + speed = task.speed + if speed is None: + return Text("?", style="progress.data.speed") + return Text(f"{speed:2.2f} it/s", style="progress.data.speed") + +with Progress( + SpinnerColumn(), + "[progress.description]{task.description}", + BarColumn(), + TextColumn("{task.completed}/{task.total}"), + "[", + TimeElapsedColumn(), + "<", + TimeRemainingColumn(), + ',', + SpeedColumn(), + ']', + refresh_per_second=1, speed_estimate_period=360 + ) as prog: + task1 = prog.add_task("[red]steps", total=args.num_steps) + task2 = prog.add_task("[red]updates", total=args.num_steps) for i_episode in itertools.count(0): print('1') episode_reward = 0 @@ -168,11 +208,12 @@ with tqdm(unit='steps', mininterval=5) as prog: writer.add_scalar('entropy_temperature/alpha', alpha, updates) updates += 1 + prog.update(task2, advance=1) next_state, reward, done, info = env.step(action) # Step episode_steps += 1 total_numsteps += 1 - prog.update(1) + prog.update(task1, advance=1) episode_reward += reward # log env stuff diff --git a/process_obs.py b/process_obs.py index 60827e0..1f5ede3 100644 --- a/process_obs.py +++ b/process_obs.py @@ -46,26 +46,26 @@ class GenerativeResnet3Headless(nn.Module): self.res4 = ResidualBlock(channel_size * 4, channel_size * 4) - self.conv4 = nn.ConvTranspose2d(channel_size * 4, channel_size * 2, kernel_size=4, stride=2, padding=1, - output_padding=1) - self.bn4 = nn.BatchNorm2d(channel_size * 2) + # self.conv4 = nn.ConvTranspose2d(channel_size * 4, channel_size * 2, kernel_size=4, stride=2, padding=1, + # output_padding=1) + # self.bn4 = nn.BatchNorm2d(channel_size * 2) - self.conv5 = nn.ConvTranspose2d(channel_size * 2, channel_size, kernel_size=4, stride=2, padding=2, - output_padding=1) - self.bn5 = nn.BatchNorm2d(channel_size) + # self.conv5 = nn.ConvTranspose2d(channel_size * 2, channel_size, kernel_size=4, stride=2, padding=2, + # output_padding=1) + # self.bn5 = nn.BatchNorm2d(channel_size) - self.conv6 = nn.ConvTranspose2d(channel_size, channel_size, kernel_size=9, stride=1, padding=4) + # self.conv6 = nn.ConvTranspose2d(channel_size, channel_size, kernel_size=9, stride=1, padding=4) - self.pos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) - self.cos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) - self.sin_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) - self.width_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) + # self.pos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) + # self.cos_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) + # self.sin_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) + # self.width_output = nn.Conv2d(in_channels=channel_size, out_channels=output_channels, kernel_size=2) - self.dropout = dropout - self.dropout_pos = nn.Dropout(p=prob) - self.dropout_cos = nn.Dropout(p=prob) - self.dropout_sin = nn.Dropout(p=prob) - self.dropout_wid = nn.Dropout(p=prob) + # self.dropout = dropout + # self.dropout_pos = nn.Dropout(p=prob) + # self.dropout_cos = nn.Dropout(p=prob) + # self.dropout_sin = nn.Dropout(p=prob) + # self.dropout_wid = nn.Dropout(p=prob) # freeze above params for param in self.parameters(): @@ -122,12 +122,12 @@ class ProcessObservation(nn.Module): os.path.dirname(os.path.abspath(__file__)), 'data/nets/cornell-randsplit-rgbd-grconvnet3-drop1-ch16/epoch_30_iou_0.97.pt' ) - self.feature_extractor = GenerativeResnet3Headless().eval() - self.feature_extractor.load_state_dict(state_dict=torch.load(grconvnet3_path)) + self.feature_extractor = GenerativeResnet3Headless()#.half() + self.feature_extractor.load_state_dict(state_dict=torch.load(grconvnet3_path), strict=False) old_img_size = (res[0], res[1], 8) new_img_size = (res[0]//16-1, res[1]//16-1, 8) - self.reduce_action_space = int(np.prod(old_img_size) - np.prod(new_img_size)) + self.reduce_obs_space = int(np.prod(old_img_size) - np.prod(new_img_size)) def __call__(self, obs): """ @@ -135,7 +135,7 @@ class ProcessObservation(nn.Module): This assumes the observations ends in 2 rgbd images with shape (224, 244, 4) """ - # import pdb; pdb.set_trace() + assert obs.shape[-1] > self.res[0] * self.res[1] * 8 h, w = self.res px = h * w base_rgbd = obs[:, -px * 4:].reshape((-1, h, w, 4)) diff --git a/replay_memory.py b/replay_memory.py index 37d6580..2767e41 100644 --- a/replay_memory.py +++ b/replay_memory.py @@ -5,7 +5,23 @@ import hickle import os from loguru import logger -class ReplayMemory: +import lz4.frame +import cloudpickle as pickle + +def pack(data): + data = pickle.dumps(data) + data = lz4.frame.compress(data) + # data = base64.b64encode(data).decode("ascii") + return data + +def unpack(data): + # data = base64.b64decode(data) + data = lz4.frame.decompress(data) + data = pickle.loads(data) + return data + + +class ReplayMemory2: def __init__(self, capacity, seed): random.seed(seed) self.capacity = capacity @@ -15,11 +31,14 @@ class ReplayMemory: def push(self, state, action, reward, next_state, done): if len(self.buffer) < self.capacity: self.buffer.append(None) - self.buffer[self.position] = (state, action, reward, next_state, done) + batch = (state, action, reward, next_state, done) + # batch = pack(batch) # slow it down 10x + self.buffer[self.position] = batch self.position = (self.position + 1) % self.capacity def sample(self, batch_size): batch = random.sample(self.buffer, batch_size) + # batch = [unpack(d) for d in batch] state, action, reward, next_state, done = map(np.stack, zip(*batch)) return state, action, reward, next_state, done @@ -35,3 +54,77 @@ class ReplayMemory: if memory_path is not None: self.buffer = hickle.load(memory_path) self.position = len(self.buffer) + + +class ReplayMemory: + def __init__(self, capacity, seed, observation_dim, action_dim): + random.seed(seed) + self.capacity = capacity + self._observations = np.zeros((capacity, observation_dim), dtype='float16') + self._actions = np.zeros((capacity, action_dim)) + self._rewards = np.zeros((capacity, 1)) + self._next_obs = np.zeros((capacity, observation_dim), dtype='float16') + self._terminals = np.zeros((capacity, 1), dtype='uint8') + self.position = 0 + self._size = 0 + + def push(self, state, action, reward, next_state, done): + self._observations[self.position] = state + self._actions[self.position] = action + self._rewards[self.position] = reward + self._next_obs[self.position] = next_state + self._terminals[self.position] = done + self.position = (self.position + 1) % self.capacity + if self._size self.capacity: +# # write to a dask capable file +# self.position = (self.position + 1) % self.capacity +# raise NotImplementedError() + +# def sample(self, batch_size): +# # first choose a historic dask file, and this one +# # sample from both +# indices = np.random.choice(self._size, size=batch_size) +# state = self._observations[indices] +# action = self._actions[indices] +# reward = self._rewards[indices] +# next_state = self._next_obs[indices] +# done = self._terminals[indices] +# return state, action, reward, next_state, done + +# def __len__(self): +# return len(self._observations) diff --git a/sac.py b/sac.py index 6d91743..3111da6 100644 --- a/sac.py +++ b/sac.py @@ -44,16 +44,20 @@ class SAC(object): self.alpha = 0 self.automatic_entropy_tuning = False self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device) - self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) + self.policy_optim = Adam( + list(self.policy.parameters()) + list(process_obs.parameters()), + lr=args.lr) def select_action(self, obs, evaluate=False): - obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0) - state = self.process_obs(obs) - if evaluate is False: - action, _, _ = self.policy.sample(state) - else: - _, _, action = self.policy.sample(state) - return action.detach().cpu().numpy()[0] + with torch.no_grad(): + obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0) + state = self.process_obs(obs) + if evaluate is False: + action, _, _ = self.policy.sample(state) + else: + _, _, action = self.policy.sample(state) + action = action.detach().cpu().numpy()[0] + return action def update_parameters(self, memory, batch_size, updates): # Sample a batch from memory