Hi guys, I am quite new to CUDA programming, so please bear with me; any comments that help me improve my code style or my code are appreciated.

I am currently trying to implement a VoxelGrid filter for point clouds using CUDA or Thrust. So far I have tried to follow the algorithm of the PCL VoxelGrid filter, which consists of the following steps:

- Calculate the voxel-grid index for each point.
- Sort the points (together with their indices) by those indices.
- Count the number of unique indices.
- Count the number of points per index.
- Add up the XYZ values and the squared RGB values of the points with the same index.
- Divide the accumulated values by the weights (the point counts) and take the square root of the RGB values.

Sadly, this does not perform as well as I had hoped, so I hope the community has some suggestions for speeding up this process further.

```
/**
 * Thrust binary functor that converts the accumulated sums of one voxel into
 * the averaged output point.
 *
 * @param w  Number of points accumulated into the voxel (its weight).
 *           Presumably always >= 1 because it is a per-voxel point count;
 *           a weight of 0 would divide by zero.
 * @param v  Accumulated voxel. x/y/z are divided by w; r/g/b are divided by w
 *           and then square-rooted, since the colour channels are summed as
 *           squared values.
 * @return   Point holding the averaged position, with the colour written into
 *           the first three bytes of `rgb` (byte 0 = blue, 1 = green, 2 = red).
 */
struct PointDiv {
    __host__ __device__
    Point operator()(const uint32_t& w, const Voxel& v) const {
        // Value-initialise the result so that anything not written below
        // (e.g. the remaining byte(s) of rgb) is not left indeterminate
        // when Point is a plain aggregate.
        Point p{};

        // Convert the weight once; all divisions below are done in float.
        const float weight = static_cast<float>(w);

        p.x = v.x / weight;
        p.y = v.y / weight;
        p.z = v.z / weight;

        // The colour is stored byte-wise inside p.rgb. Accessing an object
        // through a uint8_t (unsigned char) pointer is permitted by the
        // aliasing rules.
        uint8_t* rgb = reinterpret_cast<uint8_t*>(&p.rgb);

        // Cast the channel sums to float before dividing so the mean is never
        // truncated by an integer division, and use the single-precision math
        // functions to avoid double-precision arithmetic on the device.
        rgb[2] = static_cast<uint8_t>(roundf(sqrtf(static_cast<float>(v.r) / weight)));
        rgb[1] = static_cast<uint8_t>(roundf(sqrtf(static_cast<float>(v.g) / weight)));
        rgb[0] = static_cast<uint8_t>(roundf(sqrtf(static_cast<float>(v.b) / weight)));
        return p;
    }
};
uint32_t voxel_grid(uint32_t num, thrust::device_vector<Voxel> d_voxel_cloud, float* out) {
thrust::device_vector<uint32_t> voxel_idxs(num);
// Step 1: Produce Indizes
thrust::transform(d_voxel_cloud.begin(), d_voxel_cloud.end(), voxel_idxs.begin(), PointToKey());
// Step 2: Sort by Idxs
thrust::sort_by_key(voxel_idxs.begin(), voxel_idxs.end(), d_voxel_cloud.begin());
// Step 3: Count Amount of Voxels
// number of histogram bins is equal to number of unique values (assumes data.size() > 0)
uint32_t num_voxels = thrust::inner_product(voxel_idxs.begin(), voxel_idxs.end() - 1, voxel_idxs.begin() + 1, 1, thrust::plus<uint32_t>(), thrust::not_equal_to<uint32_t>());
thrust::device_vector<uint32_t> d_weights(num_voxels);
thrust::device_vector<uint32_t> d_idx_reduced(num_voxels);
// Step 4: Produce "Histogram" for weights
thrust::reduce_by_key(voxel_idxs.begin(), voxel_idxs.end(), thrust::constant_iterator<uint32_t>(1), d_idx_reduced.begin(), d_weights.begin());
// Step 5: Merge all values with same idx
thrust::device_vector<uint32_t> d_idx_after_vox(num_voxels);
thrust::device_vector<Point> d_point_cloud_out(num_voxels);
thrust::reduce_by_key(thrust::device, voxel_idxs.begin(), voxel_idxs.end(), d_voxel_cloud.begin(), d_idx_after_vox.begin(), d_point_cloud_out.begin(), thrust::equal_to<uint32_t>(), thrust::plus<Voxel>());
// Step 6: Divide by weight
thrust::transform(d_weights.begin(), d_weights.end(), d_point_cloud_out.begin(), d_point_cloud_out.begin(), PointDiv());
thrust::copy(d_point_cloud_out.begin(), d_point_cloud_out.end(), out);
return num_voxels;
}
```