diff --git a/.gitignore b/.gitignore index 2fe5092..772d884 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ __pycache__ runs build checkpoints +*.prof +.lvimrc +.vimtags diff --git a/models/Pointnet2Cls.py b/models/Pointnet2Cls.py index 90379d4..a4caee6 100644 --- a/models/Pointnet2Cls.py +++ b/models/Pointnet2Cls.py @@ -88,9 +88,8 @@ class Pointnet2MSG(nn.Module): npoint=512, radii=[0.1, 0.2, 0.4], nsamples=[32, 64, 128], - mlps=[[input_channels, 32, 32, - 64], [input_channels, 64, 64, 128], - [input_channels, 64, 96, 128]] + mlps=[[input_channels, 64], [input_channels, 128], + [input_channels, 128]] ) ) @@ -100,9 +99,8 @@ class Pointnet2MSG(nn.Module): npoint=128, radii=[0.2, 0.4, 0.8], nsamples=[16, 32, 64], - mlps=[[input_channels, 64, 64, - 128], [input_channels, 128, 128, 256], - [input_channels, 128, 128, 256]] + mlps=[[input_channels, 128], [input_channels, 256], + [input_channels, 256]] ) ) self.SA_modules.append( @@ -136,7 +134,6 @@ if __name__ == "__main__": model = Pointnet2MSG(3) model.cuda() - optimizer = optim.Adam(model.parameters(), lr=1e-2) model_fn = model_fn_decorator(nn.CrossEntropyLoss()) diff --git a/utils/cinclude/cuda_utils.h b/utils/cinclude/cuda_utils.h index e991bcb..741e2d5 100644 --- a/utils/cinclude/cuda_utils.h +++ b/utils/cinclude/cuda_utils.h @@ -1,24 +1,12 @@ #ifndef _CUDA_UTILS_H #define _CUDA_UTILS_H -#ifdef __cplusplus -extern "C" { -#endif +#include inline int opt_n_threads(int work_size) { - unsigned int n_threads = work_size; - n_threads--; - n_threads |= n_threads >> 1; - n_threads |= n_threads >> 2; - n_threads |= n_threads >> 4; - n_threads |= n_threads >> 8; - n_threads |= n_threads >> 16; - n_threads++; + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); - return max(min(n_threads / 2, 512), 2); + return max(min(1 << pow_2, 512), 32); } -#ifdef __cplusplus -} -#endif #endif diff --git a/utils/csrc/sampling_gpu.cu b/utils/csrc/sampling_gpu.cu index aa5c00d..95c0257 100644 --- a/utils/csrc/sampling_gpu.cu +++ b/utils/csrc/sampling_gpu.cu @@ -13,9 +13,10 @@ __global__ void gather_points_kernel(int b, int n, int c, int m, for (int i = blockIdx.x; i < b; i += gridDim.x) { for (int j = blockIdx.y * blockDim.x + threadIdx.x; j < m; j += blockDim.x * gridDim.y) { - int a = idx[i * m + j]; - memcpy(out + (i * m + j) * c, points + (i * n + a) * c, - sizeof(float) * c); + const int jj = idx[i * m + j]; + for (int l = 0; l < c; ++l) { + out[(i * m + j) * c + l] = points[(i * n + jj) * c + l]; + } } } } @@ -25,7 +26,7 @@ void gather_points_kernel_wrapper(int b, int n, int c, int npoints, float *out, cudaStream_t stream) { cudaError_t err; - gather_points_kernel<<>>(b, n, c, npoints, points, idx, out); err = cudaGetLastError();