memcpy is super slow in CUDA kernels, so group_points_kernel now copies each point element by element in a plain loop.

2026-06-27 16:00:07 +08:00 · 2018-01-29 18:22:27 -05:00
parent 009c30e5e3
commit c4ddd6b099
7 changed files with 30 additions and 14 deletions
@@ -3,3 +3,4 @@ __pycache__
 .autoenv*
 runs
 build
+checkpoints
@@ -5,7 +5,8 @@ import os, sys, h5py, subprocess, shlex


 def _get_data_files(list_filename):
-    return [line.rstrip() for line in open(list_filename)]
+    with open(list_filename) as f:
+        return [line.rstrip() for line in f]


 def _load_data_file(name):
@@ -76,7 +77,7 @@ class Indoor3DSemSeg(data.Dataset):
        pt_idxs = np.arange(0, self.num_points)
        np.random.shuffle(pt_idxs)

-        current_points = torch.from_numpy(self.points[idx, pt_idxs, :]).type(
+        current_points = torch.from_numpy(self.points[idx, pt_idxs]).type(
            torch.FloatTensor
        )
        current_labels = torch.from_numpy(self.labels[idx, pt_idxs]).type(
@@ -8,7 +8,8 @@ sys.path.append(BASE_DIR)


 def _get_data_files(list_filename):
-    return [line.rstrip()[5:] for line in open(list_filename)]
+    with open(list_filename) as f:
+        return [line.rstrip()[5:] for line in f]


 def _load_data_file(name):
@@ -80,6 +81,7 @@ class ModelNet40Cls(data.Dataset):

    def set_num_points(self, pts):
        self.num_points = pts
+        self.actual_number_of_points = pts

    def randomize(self):
        self.actual_number_of_points = min(
@@ -128,13 +128,15 @@ if __name__ == "__main__":
    from torch.autograd import Variable
    import numpy as np
    import torch.optim as optim
+    import torch.autograd.profiler as profiler
    B = 2
-    N = 32
+    N = 2048
    inputs = torch.randn(B, N, 9).cuda()
    labels = torch.from_numpy(np.random.randint(0, 3, size=B)).cuda()
-    model = Pointnet2SSG(3)
+    model = Pointnet2MSG(3)
    model.cuda()

+
    optimizer = optim.Adam(model.parameters(), lr=1e-2)

    model_fn = model_fn_decorator(nn.CrossEntropyLoss())
@@ -16,6 +16,9 @@ import utils.pytorch_utils as pt_utils
 import utils.data_utils as d_utils
 import argparse

+torch.backends.cudnn.enabled = True
+torch.backends.cudnn.benchmark = True
+

 def parse_args():
    parser = argparse.ArgumentParser(
@@ -141,8 +144,8 @@ if __name__ == "__main__":
        model,
        model_fn,
        optimizer,
-        checkpoint_name="cls_checkpoint",
-        best_name="cls_best",
+        checkpoint_name="checkpoints/cls_xyz",
+        best_name="checkpoints/cls_xyz_best",
        lr_scheduler=lr_scheduler,
        bnm_scheduler=bnm_scheduler
    )
@@ -21,8 +21,9 @@ __global__ void group_points_kernel(int b, int n, int c, int npoints,
    for (int j = index; j < npoints; j += stride) {
 	for (int k = 0; k < nsample; ++k) {
 	    int ii = idx[j * nsample + k];
-	    memcpy(out + j * nsample * c + k * c, points + ii * c,
-		   sizeof(float) * c);
+	    for (int l = 0; l < c; ++l) {
+		out[j * nsample * c + k * c + l] = points[ii * c + l];
+	    }
 	}
    }
 }
@@ -31,7 +31,8 @@ class PointnetSAModuleMSG(nn.Module):
            radii: List[float],
            nsamples: List[int],
            mlps: List[List[int]],
-            bn: bool = True
+            bn: bool = True,
+            use_xyz: bool = True
    ):
        super().__init__()

@@ -43,7 +44,9 @@ class PointnetSAModuleMSG(nn.Module):
        for i in range(len(radii)):
            radius = radii[i]
            nsample = nsamples[i]
-            self.groupers.append(pointnet2_utils.QueryAndGroup(radius, nsample))
+            self.groupers.append(
+                pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz)
+            )
            mlp_spec = mlps[i]
            self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn))

@@ -111,7 +114,8 @@ class PointnetSAModule(nn.Module):
            npoint: int = None,
            radius: float = None,
            nsample: int = None,
-            bn: bool = True
+            bn: bool = True,
+            use_xyz: bool = True
    ):
        super().__init__()
        self.npoint = npoint
@@ -119,9 +123,11 @@ class PointnetSAModule(nn.Module):
        if self.npoint is not None:
            assert radius is not None
            assert nsample is not None
-            self.grouper = pointnet2_utils.QueryAndGroup(radius, nsample)
+            self.grouper = pointnet2_utils.QueryAndGroup(
+                radius, nsample, use_xyz=use_xyz
+            )
        else:
-            self.grouper = pointnet2_utils.GroupAll()
+            self.grouper = pointnet2_utils.GroupAll(use_xyz=use_xyz)

        self.mlp = pt_utils.SharedMLP(mlp, bn=bn)