How to allocate memory for host in cuda main function?

I’m allocating memory for the host with malloc however I’m getting a segmentation fault I’ve also tried using new, calloc, cudaHostAlloc with the cudaHostAllocDefault flag but still getting that error but no errors accessing from the cuda main function?

extern "C"
__host__
void deproject_depth_cuda(uint16_t *serialization_point, uint32_t * world_counter, uint8_t * voxels, const rs2_intrinsics & intrin, const uint16_t * depth, float depth_scale, double w, double x, double y, double z)
{
    int count = intrin.height * intrin.width;
    int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK ;

    uint16_t *dev_serializ_point = 0;
    uint16_t *ray_points = 0;
    uint32_t *dev_world_counter = 0;
    uint8_t * dev_voxels = 0;
    uint8_t * ray_filter = 0;
    uint16_t *dev_depth = 0;
    rs2_intrinsics* dev_intrin = 0;
    cudaError_t result;

    free(serialization_point);

    result = cudaMalloc(&ray_points, count * sizeof(uint16_t) * 3);
    //std::cout<<"Stage 1"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMalloc(&dev_voxels, 16000000);
    //std::cout<<"Stage 2"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMalloc(&ray_filter, 16000000);
    //std::cout<<"Stage 3"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMalloc(&dev_depth, count * sizeof(uint16_t));
    //std::cout<<"Stage 4"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMalloc(&dev_intrin, sizeof(rs2_intrinsics));
    //std::cout<<"Stage 5"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMalloc(&dev_world_counter, sizeof(uint32_t));
    //std::cout<<"Stage 6"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMemcpy(dev_voxels, voxels, 16000000, cudaMemcpyHostToDevice);
    //std::cout<<"Stage 7"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMemcpy(dev_depth, depth, count * sizeof(uint16_t), cudaMemcpyHostToDevice);
    //std::cout<<"Stage 8"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMemcpy(dev_intrin, &intrin, sizeof(rs2_intrinsics), cudaMemcpyHostToDevice);
    //std::cout<<"Stage 9"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMemcpy(dev_world_counter, world_counter, sizeof(uint32_t), cudaMemcpyHostToDevice);
    //std::cout<<"Stage 10"<<std::endl;
    assert(result == cudaSuccess);

    //std::cout<<"Deproject_depth "<<count<<std::endl;

    kernel_deproject_depth_cuda<<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>>(ray_filter, ray_points,  dev_intrin, dev_depth, depth_scale, w,x,y,z);

    cudaFree(ray_filter);

    cudaDeviceSynchronize();

    //std::cout<<"Voxelization "<<std::endl;

    voxelization_cuda<<<128, 256>>>(dev_world_counter, dev_voxels, ray_points, count);

    cudaDeviceSynchronize();

   //std::cout<<"RayCasting "<<std::endl;

   ray_casting_map<<<256, 256>>>(dev_world_counter, dev_voxels, ray_points, count);

   cudaDeviceSynchronize();

   result = cudaMemcpy (world_counter, dev_world_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost);
   //std::cout<<"Stage 11"<<std::endl;
   assert(result == cudaSuccess);

   result = cudaMemcpy(voxels, dev_voxels, 16000000, cudaMemcpyDeviceToHost);
   //std::cout<<"Stage 12"<<std::endl;
   assert(result == cudaSuccess);

   result = cudaMalloc(&dev_serializ_point, *world_counter * sizeof(uint16_t) * 3);
//   std::cout<<"Stage 13"<<std::endl;
   assert(result == cudaSuccess);

   serialization_point = (uint16_t*)malloc(*world_counter * sizeof(uint16_t) * 3);

 //  std::cout<<"Serialization "<<std::endl;

   serialization_cuda<<<128, 256>>>(dev_voxels, dev_serializ_point);

   result = cudaMemcpy(serialization_point, dev_serializ_point, *world_counter * sizeof(uint16_t) * 3, cudaMemcpyDeviceToHost);
//   std::cout<<"Stage 15"<<std::endl;
   assert(result == cudaSuccess);

   std::cout<<" arr_g: "<<serialization_point[0]<<std::endl;
   std::cout<<" arr_g: "<<serialization_point[1]<<std::endl;
   std::cout<<" arr_g: "<<serialization_point[2]<<std::endl;

   cudaDeviceSynchronize();

   cudaFree(dev_serializ_point);
   cudaFree(ray_points);
   cudaFree(dev_voxels);
   cudaFree(dev_depth);
   cudaFree(dev_intrin);
}

serialization_cuda
__global__

void serialization_cuda(const uint8_t * voxels, uint16_t * output)
{
    uint32_t i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i >= 16000000) {
        return;
    }

    uint32_t stride = blockDim.x * gridDim.x;

    __shared__ uint32_t counter;

    counter = 0;

  for (uint32_t j = i; j < 16000000; j += stride) {

    if(voxels[j]){

    __threadfence();
    uint32_t i = atomicAdd(&counter, 1);

    //__syncthreads();
    output[i * 3 + 2] = j / (400 * 400);
    uint32_t remaining = j % (400 * 400);
    output[i * 3 + 1] = remaining / 400;
    output[i * 3] = remaining % 400;

    }
Screenshot

a seg fault can always be localized to a single line of host code that is producing the fault. I suggest you start by identifying that line of code. It is not obvious which line is causing the seg fault from your post.

I also usually recommend that people seeking debugging help post a short complete example that demonstrates the problem, not snippets.

The error occurs after accessing the 0 element of the array serialization_point[0] and displaying its contents on the screen as shown by the arrow in the screenshot under the spoiler (arrow in line 313). But note that before that I already accessed this array element in the cuda main function and this did not cause an error (output arr_g value in screenshot). I believe that the allocated memory can be somehow reserved for the GPU and can only be accessed using in the cuda function and I did not find a way to allocate memory in the cuda main function so that it would be available on the CPU?

I apologize for the pieces of code, but that’s all that is happening at the moment with this variable, I did not publish the rest of the fragments, so as not to waste your time reading for the sake of declaring a variable in the code, for example.

I guess the first piece of code you have shown is from the function deproject_depth_cuda. And I guess the serialization_point in the function call is the same as serialization_point in the function body there.

So just guessing here based on that conjecture.

It looks like you are making a mistake around C/C++ pass-by-value to a function call. You cannot allocate a variable in the function the way you are doing it. Yes, that works inside the function, but not at the calling scope. This is a common C/C++ programming error representing a lack of understanding of pass-by-value characteristics, and it has nothing to do with CUDA.

There are numerous questions like this on various forums. here is an example.

1 Like

Yes it is, thanks for taking the time! I also found another example with a similar situation and an explanation with a simple example.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.