mirror of
https://github.com/wassname/Pointnet2_PyTorch.git
synced 2026-06-27 16:00:07 +08:00
85 lines
2.7 KiB
Plaintext
85 lines
2.7 KiB
Plaintext
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "cuda_utils.h"
|
|
#include "group_points_gpu.h"
|
|
|
|
// input: points(b, c, n) idx(b, npoints, nsample)
|
|
// output: out(b, c, npoints, nsample)
|
|
__global__ void group_points_kernel(int b, int c, int n, int npoints,
|
|
int nsample,
|
|
const float *__restrict__ points,
|
|
const int *__restrict__ idx,
|
|
float *__restrict__ out) {
|
|
int batch_index = blockIdx.x;
|
|
points += batch_index * n * c;
|
|
idx += batch_index * npoints * nsample;
|
|
out += batch_index * npoints * nsample * c;
|
|
|
|
const int index = threadIdx.y * blockDim.x + threadIdx.x;
|
|
const int stride = blockDim.y * blockDim.x;
|
|
for (int i = index; i < c * npoints; i += stride) {
|
|
const int l = i / npoints;
|
|
const int j = i % npoints;
|
|
for (int k = 0; k < nsample; ++k) {
|
|
int ii = idx[j * nsample + k];
|
|
out[(l * npoints + j) * nsample + k] = points[l * n + ii];
|
|
}
|
|
}
|
|
}
|
|
|
|
void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample,
|
|
const float *points, const int *idx,
|
|
float *out, cudaStream_t stream) {
|
|
|
|
cudaError_t err;
|
|
group_points_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>(
|
|
b, c, n, npoints, nsample, points, idx, out);
|
|
|
|
err = cudaGetLastError();
|
|
if (cudaSuccess != err) {
|
|
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
|
exit(-1);
|
|
}
|
|
}
|
|
|
|
// input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample)
|
|
// output: grad_points(b, c, n)
|
|
__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
|
|
int nsample,
|
|
const float *__restrict__ grad_out,
|
|
const int *__restrict__ idx,
|
|
float *__restrict__ grad_points) {
|
|
int batch_index = blockIdx.x;
|
|
grad_out += batch_index * npoints * nsample * c;
|
|
idx += batch_index * npoints * nsample;
|
|
grad_points += batch_index * n * c;
|
|
|
|
const int index = threadIdx.y * blockDim.x + threadIdx.x;
|
|
const int stride = blockDim.y * blockDim.x;
|
|
for (int i = index; i < c * npoints; i += stride) {
|
|
const int l = i / npoints;
|
|
const int j = i % npoints;
|
|
for (int k = 0; k < nsample; ++k) {
|
|
int ii = idx[j * nsample + k];
|
|
atomicAdd(grad_points + l * n + ii,
|
|
grad_out[(l * npoints + j) * nsample + k]);
|
|
}
|
|
}
|
|
}
|
|
|
|
void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
|
|
int nsample, const float *grad_out,
|
|
const int *idx, float *grad_points,
|
|
cudaStream_t stream) {
|
|
cudaError_t err;
|
|
group_points_grad_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>(
|
|
b, c, n, npoints, nsample, grad_out, idx, grad_points);
|
|
|
|
err = cudaGetLastError();
|
|
if (cudaSuccess != err) {
|
|
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
|
exit(-1);
|
|
}
|
|
}
|