#include #include #include "cuda_utils.h" #include "group_points_gpu.h" // input: points(b, c, n) idx(b, npoints, nsample) // output: out(b, c, npoints, nsample) __global__ void group_points_kernel(int b, int c, int n, int npoints, int nsample, const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { int batch_index = blockIdx.x; points += batch_index * n * c; idx += batch_index * npoints * nsample; out += batch_index * npoints * nsample * c; const int index = threadIdx.y * blockDim.x + threadIdx.x; const int stride = blockDim.y * blockDim.x; for (int i = index; i < c * npoints; i += stride) { const int l = i / npoints; const int j = i % npoints; for (int k = 0; k < nsample; ++k) { int ii = idx[j * nsample + k]; out[(l * npoints + j) * nsample + k] = points[l * n + ii]; } } } void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out, cudaStream_t stream) { cudaError_t err; group_points_kernel<<>>( b, c, n, npoints, nsample, points, idx, out); err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } } // input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample) // output: grad_points(b, c, n) __global__ void group_points_grad_kernel(int b, int c, int n, int npoints, int nsample, const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) { int batch_index = blockIdx.x; grad_out += batch_index * npoints * nsample * c; idx += batch_index * npoints * nsample; grad_points += batch_index * n * c; const int index = threadIdx.y * blockDim.x + threadIdx.x; const int stride = blockDim.y * blockDim.x; for (int i = index; i < c * npoints; i += stride) { const int l = i / npoints; const int j = i % npoints; for (int k = 0; k < nsample; ++k) { int ii = idx[j * nsample + k]; atomicAdd(grad_points + l * n + ii, grad_out[(l * npoints + j) * nsample + k]); } } } void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, int nsample, const float *grad_out, const int *idx, float *grad_points, cudaStream_t stream) { cudaError_t err; group_points_grad_kernel<<>>( b, c, n, npoints, nsample, grad_out, idx, grad_points); err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); exit(-1); } }