I’m allocating memory for the host with malloc however I’m getting a segmentation fault I’ve also tried using new, calloc, cudaHostAlloc with the cudaHostAllocDefault flag but still getting that error but no errors accessing from the cuda main function?
extern "C"
__host__
void deproject_depth_cuda(uint16_t *serialization_point, uint32_t * world_counter, uint8_t * voxels, const rs2_intrinsics & intrin, const uint16_t * depth, float depth_scale, double w, double x, double y, double z)
{
int count = intrin.height * intrin.width;
int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK ;
uint16_t *dev_serializ_point = 0;
uint16_t *ray_points = 0;
uint32_t *dev_world_counter = 0;
uint8_t * dev_voxels = 0;
uint8_t * ray_filter = 0;
uint16_t *dev_depth = 0;
rs2_intrinsics* dev_intrin = 0;
cudaError_t result;
free(serialization_point);
result = cudaMalloc(&ray_points, count * sizeof(uint16_t) * 3);
//std::cout<<"Stage 1"<<std::endl;
assert(result == cudaSuccess);
result = cudaMalloc(&dev_voxels, 16000000);
//std::cout<<"Stage 2"<<std::endl;
assert(result == cudaSuccess);
result = cudaMalloc(&ray_filter, 16000000);
//std::cout<<"Stage 3"<<std::endl;
assert(result == cudaSuccess);
result = cudaMalloc(&dev_depth, count * sizeof(uint16_t));
//std::cout<<"Stage 4"<<std::endl;
assert(result == cudaSuccess);
result = cudaMalloc(&dev_intrin, sizeof(rs2_intrinsics));
//std::cout<<"Stage 5"<<std::endl;
assert(result == cudaSuccess);
result = cudaMalloc(&dev_world_counter, sizeof(uint32_t));
//std::cout<<"Stage 6"<<std::endl;
assert(result == cudaSuccess);
result = cudaMemcpy(dev_voxels, voxels, 16000000, cudaMemcpyHostToDevice);
//std::cout<<"Stage 7"<<std::endl;
assert(result == cudaSuccess);
result = cudaMemcpy(dev_depth, depth, count * sizeof(uint16_t), cudaMemcpyHostToDevice);
//std::cout<<"Stage 8"<<std::endl;
assert(result == cudaSuccess);
result = cudaMemcpy(dev_intrin, &intrin, sizeof(rs2_intrinsics), cudaMemcpyHostToDevice);
//std::cout<<"Stage 9"<<std::endl;
assert(result == cudaSuccess);
result = cudaMemcpy(dev_world_counter, world_counter, sizeof(uint32_t), cudaMemcpyHostToDevice);
//std::cout<<"Stage 10"<<std::endl;
assert(result == cudaSuccess);
//std::cout<<"Deproject_depth "<<count<<std::endl;
kernel_deproject_depth_cuda<<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>>(ray_filter, ray_points, dev_intrin, dev_depth, depth_scale, w,x,y,z);
cudaFree(ray_filter);
cudaDeviceSynchronize();
//std::cout<<"Voxelization "<<std::endl;
voxelization_cuda<<<128, 256>>>(dev_world_counter, dev_voxels, ray_points, count);
cudaDeviceSynchronize();
//std::cout<<"RayCasting "<<std::endl;
ray_casting_map<<<256, 256>>>(dev_world_counter, dev_voxels, ray_points, count);
cudaDeviceSynchronize();
result = cudaMemcpy (world_counter, dev_world_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost);
//std::cout<<"Stage 11"<<std::endl;
assert(result == cudaSuccess);
result = cudaMemcpy(voxels, dev_voxels, 16000000, cudaMemcpyDeviceToHost);
//std::cout<<"Stage 12"<<std::endl;
assert(result == cudaSuccess);
result = cudaMalloc(&dev_serializ_point, *world_counter * sizeof(uint16_t) * 3);
// std::cout<<"Stage 13"<<std::endl;
assert(result == cudaSuccess);
serialization_point = (uint16_t*)malloc(*world_counter * sizeof(uint16_t) * 3);
// std::cout<<"Serialization "<<std::endl;
serialization_cuda<<<128, 256>>>(dev_voxels, dev_serializ_point);
result = cudaMemcpy(serialization_point, dev_serializ_point, *world_counter * sizeof(uint16_t) * 3, cudaMemcpyDeviceToHost);
// std::cout<<"Stage 15"<<std::endl;
assert(result == cudaSuccess);
std::cout<<" arr_g: "<<serialization_point[0]<<std::endl;
std::cout<<" arr_g: "<<serialization_point[1]<<std::endl;
std::cout<<" arr_g: "<<serialization_point[2]<<std::endl;
cudaDeviceSynchronize();
cudaFree(dev_serializ_point);
cudaFree(ray_points);
cudaFree(dev_voxels);
cudaFree(dev_depth);
cudaFree(dev_intrin);
}
serialization_cuda
__global__
void serialization_cuda(const uint8_t * voxels, uint16_t * output)
{
uint32_t i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= 16000000) {
return;
}
uint32_t stride = blockDim.x * gridDim.x;
__shared__ uint32_t counter;
counter = 0;
for (uint32_t j = i; j < 16000000; j += stride) {
if(voxels[j]){
__threadfence();
uint32_t i = atomicAdd(&counter, 1);
//__syncthreads();
output[i * 3 + 2] = j / (400 * 400);
uint32_t remaining = j % (400 * 400);
output[i * 3 + 1] = remaining / 400;
output[i * 3] = remaining % 400;
}