memcpy is very slow inside CUDA kernels; replace it with an explicit element-wise copy loop in group_points_kernel.

This commit is contained in:
erikwijmans
2018-01-29 18:22:27 -05:00
parent 009c30e5e3
commit c4ddd6b099
7 changed files with 30 additions and 14 deletions
+1
View File
@@ -3,3 +3,4 @@ __pycache__
.autoenv*
runs
build
checkpoints
+3 -2
View File
@@ -5,7 +5,8 @@ import os, sys, h5py, subprocess, shlex
def _get_data_files(list_filename):
return [line.rstrip() for line in open(list_filename)]
with open(list_filename) as f:
return [line.rstrip() for line in f]
def _load_data_file(name):
@@ -76,7 +77,7 @@ class Indoor3DSemSeg(data.Dataset):
pt_idxs = np.arange(0, self.num_points)
np.random.shuffle(pt_idxs)
current_points = torch.from_numpy(self.points[idx, pt_idxs, :]).type(
current_points = torch.from_numpy(self.points[idx, pt_idxs]).type(
torch.FloatTensor
)
current_labels = torch.from_numpy(self.labels[idx, pt_idxs]).type(
+3 -1
View File
@@ -8,7 +8,8 @@ sys.path.append(BASE_DIR)
def _get_data_files(list_filename):
return [line.rstrip()[5:] for line in open(list_filename)]
with open(list_filename) as f:
return [line.rstrip()[5:] for line in f]
def _load_data_file(name):
@@ -80,6 +81,7 @@ class ModelNet40Cls(data.Dataset):
def set_num_points(self, pts):
self.num_points = pts
self.actual_number_of_points = pts
def randomize(self):
self.actual_number_of_points = min(
+4 -2
View File
@@ -128,13 +128,15 @@ if __name__ == "__main__":
from torch.autograd import Variable
import numpy as np
import torch.optim as optim
import torch.autograd.profiler as profiler
B = 2
N = 32
N = 2048
inputs = torch.randn(B, N, 9).cuda()
labels = torch.from_numpy(np.random.randint(0, 3, size=B)).cuda()
model = Pointnet2SSG(3)
model = Pointnet2MSG(3)
model.cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-2)
model_fn = model_fn_decorator(nn.CrossEntropyLoss())
+5 -2
View File
@@ -16,6 +16,9 @@ import utils.pytorch_utils as pt_utils
import utils.data_utils as d_utils
import argparse
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
def parse_args():
parser = argparse.ArgumentParser(
@@ -141,8 +144,8 @@ if __name__ == "__main__":
model,
model_fn,
optimizer,
checkpoint_name="cls_checkpoint",
best_name="cls_best",
checkpoint_name="checkpoints/cls_xyz",
best_name="checkpoints/cls_xyz_best",
lr_scheduler=lr_scheduler,
bnm_scheduler=bnm_scheduler
)
+3 -2
View File
@@ -21,8 +21,9 @@ __global__ void group_points_kernel(int b, int n, int c, int npoints,
for (int j = index; j < npoints; j += stride) {
for (int k = 0; k < nsample; ++k) {
int ii = idx[j * nsample + k];
memcpy(out + j * nsample * c + k * c, points + ii * c,
sizeof(float) * c);
for (int l = 0; l < c; ++l) {
out[j * nsample * c + k * c + l] = points[ii * c + l];
}
}
}
}
+11 -5
View File
@@ -31,7 +31,8 @@ class PointnetSAModuleMSG(nn.Module):
radii: List[float],
nsamples: List[int],
mlps: List[List[int]],
bn: bool = True
bn: bool = True,
use_xyz: bool = True
):
super().__init__()
@@ -43,7 +44,9 @@ class PointnetSAModuleMSG(nn.Module):
for i in range(len(radii)):
radius = radii[i]
nsample = nsamples[i]
self.groupers.append(pointnet2_utils.QueryAndGroup(radius, nsample))
self.groupers.append(
pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz)
)
mlp_spec = mlps[i]
self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn))
@@ -111,7 +114,8 @@ class PointnetSAModule(nn.Module):
npoint: int = None,
radius: float = None,
nsample: int = None,
bn: bool = True
bn: bool = True,
use_xyz: bool = True
):
super().__init__()
self.npoint = npoint
@@ -119,9 +123,11 @@ class PointnetSAModule(nn.Module):
if self.npoint is not None:
assert radius is not None
assert nsample is not None
self.grouper = pointnet2_utils.QueryAndGroup(radius, nsample)
self.grouper = pointnet2_utils.QueryAndGroup(
radius, nsample, use_xyz=use_xyz
)
else:
self.grouper = pointnet2_utils.GroupAll()
self.grouper = pointnet2_utils.GroupAll(use_xyz=use_xyz)
self.mlp = pt_utils.SharedMLP(mlp, bn=bn)