diff --git a/.gitignore b/.gitignore
index 2fe5092..772d884 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,6 @@ __pycache__
 runs
 build
 checkpoints
+*.prof
+.lvimrc
+.vimtags
diff --git a/models/Pointnet2Cls.py b/models/Pointnet2Cls.py
index 90379d4..a4caee6 100644
--- a/models/Pointnet2Cls.py
+++ b/models/Pointnet2Cls.py
@@ -88,9 +88,8 @@ class Pointnet2MSG(nn.Module):
                 npoint=512,
                 radii=[0.1, 0.2, 0.4],
                 nsamples=[32, 64, 128],
-                mlps=[[input_channels, 32, 32,
-                       64], [input_channels, 64, 64, 128],
-                      [input_channels, 64, 96, 128]]
+                mlps=[[input_channels, 64], [input_channels, 128],
+                      [input_channels, 128]]
             )
         )
 
@@ -100,9 +99,8 @@ class Pointnet2MSG(nn.Module):
                 npoint=128,
                 radii=[0.2, 0.4, 0.8],
                 nsamples=[16, 32, 64],
-                mlps=[[input_channels, 64, 64,
-                       128], [input_channels, 128, 128, 256],
-                      [input_channels, 128, 128, 256]]
+                mlps=[[input_channels, 128], [input_channels, 256],
+                      [input_channels, 256]]
             )
         )
         self.SA_modules.append(
@@ -136,7 +134,6 @@ if __name__ == "__main__":
     model = Pointnet2MSG(3)
     model.cuda()
 
-
     optimizer = optim.Adam(model.parameters(), lr=1e-2)
 
     model_fn = model_fn_decorator(nn.CrossEntropyLoss())
diff --git a/utils/cinclude/cuda_utils.h b/utils/cinclude/cuda_utils.h
index e991bcb..741e2d5 100644
--- a/utils/cinclude/cuda_utils.h
+++ b/utils/cinclude/cuda_utils.h
@@ -1,24 +1,22 @@
 #ifndef _CUDA_UTILS_H
 #define _CUDA_UTILS_H
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+#include <cmath>
 
+// Returns a thread count for `work_size` work items: the largest power of
+// two that does not exceed `work_size`, clamped to the range [32, 512].
+// A non-positive `work_size` yields 32.
+//
+// Integer-only on purpose: truncating log(work_size) / log(2.0) can land just
+// below an exact power of two, and is undefined for work_size <= 0.
 inline int opt_n_threads(int work_size) {
-	unsigned int n_threads = work_size;
-	n_threads--;
-	n_threads |= n_threads >> 1;
-	n_threads |= n_threads >> 2;
-	n_threads |= n_threads >> 4;
-	n_threads |= n_threads >> 8;
-	n_threads |= n_threads >> 16;
-	n_threads++;
+    int n_threads = 32;
 
-	return max(min(n_threads / 2, 512), 2);
+    // Double while the next power of two still fits and stays within the cap.
+    while (n_threads < 512 && n_threads * 2 <= work_size) {
+        n_threads *= 2;
+    }
+    return n_threads;
 }
 
-#ifdef __cplusplus
-}
-#endif
 #endif
diff --git a/utils/csrc/sampling_gpu.cu b/utils/csrc/sampling_gpu.cu
index aa5c00d..95c0257 100644
--- a/utils/csrc/sampling_gpu.cu
+++ b/utils/csrc/sampling_gpu.cu
@@ -13,9 +13,10 @@ __global__ void gather_points_kernel(int b, int n, int c, int m,
     for (int i = blockIdx.x; i < b; i += gridDim.x) {
 	for (int j = blockIdx.y * blockDim.x + threadIdx.x; j < m;
 	     j += blockDim.x * gridDim.y) {
-	    int a = idx[i * m + j];
-	    memcpy(out + (i * m + j) * c, points + (i * n + a) * c,
-		   sizeof(float) * c);
+	    const int jj = idx[i * m + j];
+	    for (int l = 0; l < c; ++l) {
+		out[(i * m + j) * c + l] = points[(i * n + jj) * c + l];
+	    }
 	}
     }
 }
@@ -25,7 +26,7 @@ void gather_points_kernel_wrapper(int b, int n, int c, int npoints,
 				  float *out, cudaStream_t stream) {
 
     cudaError_t err;
-    gather_points_kernel<<<dim3(2, 8, 1), opt_n_threads(npoints) / 4, 0,
+    gather_points_kernel<<<dim3(b, 8, 1), opt_n_threads(npoints), 0,
 			   stream>>>(b, n, c, npoints, points, idx, out);
 
     err = cudaGetLastError();