How to allocate memory for host in cuda main function?

ru.sencis · July 14, 2023, 12:44pm

I’m allocating memory for the host with malloc however I’m getting a segmentation fault I’ve also tried using new, calloc, cudaHostAlloc with the cudaHostAllocDefault flag but still getting that error but no errors accessing from the cuda main function?

extern "C"
__host__
void deproject_depth_cuda(uint16_t *serialization_point, uint32_t * world_counter, uint8_t * voxels, const rs2_intrinsics & intrin, const uint16_t * depth, float depth_scale, double w, double x, double y, double z)
{
    int count = intrin.height * intrin.width;
    int numBlocks = count / RS2_CUDA_THREADS_PER_BLOCK ;

    uint16_t *dev_serializ_point = 0;
    uint16_t *ray_points = 0;
    uint32_t *dev_world_counter = 0;
    uint8_t * dev_voxels = 0;
    uint8_t * ray_filter = 0;
    uint16_t *dev_depth = 0;
    rs2_intrinsics* dev_intrin = 0;
    cudaError_t result;

    free(serialization_point);

    result = cudaMalloc(&ray_points, count * sizeof(uint16_t) * 3);
    //std::cout<<"Stage 1"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMalloc(&dev_voxels, 16000000);
    //std::cout<<"Stage 2"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMalloc(&ray_filter, 16000000);
    //std::cout<<"Stage 3"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMalloc(&dev_depth, count * sizeof(uint16_t));
    //std::cout<<"Stage 4"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMalloc(&dev_intrin, sizeof(rs2_intrinsics));
    //std::cout<<"Stage 5"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMalloc(&dev_world_counter, sizeof(uint32_t));
    //std::cout<<"Stage 6"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMemcpy(dev_voxels, voxels, 16000000, cudaMemcpyHostToDevice);
    //std::cout<<"Stage 7"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMemcpy(dev_depth, depth, count * sizeof(uint16_t), cudaMemcpyHostToDevice);
    //std::cout<<"Stage 8"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMemcpy(dev_intrin, &intrin, sizeof(rs2_intrinsics), cudaMemcpyHostToDevice);
    //std::cout<<"Stage 9"<<std::endl;
    assert(result == cudaSuccess);

    result = cudaMemcpy(dev_world_counter, world_counter, sizeof(uint32_t), cudaMemcpyHostToDevice);
    //std::cout<<"Stage 10"<<std::endl;
    assert(result == cudaSuccess);

    //std::cout<<"Deproject_depth "<<count<<std::endl;

    kernel_deproject_depth_cuda<<<numBlocks, RS2_CUDA_THREADS_PER_BLOCK>>>(ray_filter, ray_points,  dev_intrin, dev_depth, depth_scale, w,x,y,z);

    cudaFree(ray_filter);

    cudaDeviceSynchronize();

    //std::cout<<"Voxelization "<<std::endl;

    voxelization_cuda<<<128, 256>>>(dev_world_counter, dev_voxels, ray_points, count);

    cudaDeviceSynchronize();

   //std::cout<<"RayCasting "<<std::endl;

   ray_casting_map<<<256, 256>>>(dev_world_counter, dev_voxels, ray_points, count);

   cudaDeviceSynchronize();

   result = cudaMemcpy (world_counter, dev_world_counter, sizeof(uint32_t), cudaMemcpyDeviceToHost);
   //std::cout<<"Stage 11"<<std::endl;
   assert(result == cudaSuccess);

   result = cudaMemcpy(voxels, dev_voxels, 16000000, cudaMemcpyDeviceToHost);
   //std::cout<<"Stage 12"<<std::endl;
   assert(result == cudaSuccess);

   result = cudaMalloc(&dev_serializ_point, *world_counter * sizeof(uint16_t) * 3);
//   std::cout<<"Stage 13"<<std::endl;
   assert(result == cudaSuccess);

   serialization_point = (uint16_t*)malloc(*world_counter * sizeof(uint16_t) * 3);

 //  std::cout<<"Serialization "<<std::endl;

   serialization_cuda<<<128, 256>>>(dev_voxels, dev_serializ_point);

   result = cudaMemcpy(serialization_point, dev_serializ_point, *world_counter * sizeof(uint16_t) * 3, cudaMemcpyDeviceToHost);
//   std::cout<<"Stage 15"<<std::endl;
   assert(result == cudaSuccess);

   std::cout<<" arr_g: "<<serialization_point[0]<<std::endl;
   std::cout<<" arr_g: "<<serialization_point[1]<<std::endl;
   std::cout<<" arr_g: "<<serialization_point[2]<<std::endl;

   cudaDeviceSynchronize();

   cudaFree(dev_serializ_point);
   cudaFree(ray_points);
   cudaFree(dev_voxels);
   cudaFree(dev_depth);
   cudaFree(dev_intrin);
}

serialization_cuda

__global__

void serialization_cuda(const uint8_t * voxels, uint16_t * output)
{
    uint32_t i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i >= 16000000) {
        return;
    }

    uint32_t stride = blockDim.x * gridDim.x;

    __shared__ uint32_t counter;

    counter = 0;

  for (uint32_t j = i; j < 16000000; j += stride) {

    if(voxels[j]){

    __threadfence();
    uint32_t i = atomicAdd(&counter, 1);

    //__syncthreads();
    output[i * 3 + 2] = j / (400 * 400);
    uint32_t remaining = j % (400 * 400);
    output[i * 3 + 1] = remaining / 400;
    output[i * 3] = remaining % 400;

    }

Screenshot

Robert_Crovella · July 14, 2023, 2:06pm

a seg fault can always be localized to a single line of host code that is producing the fault. I suggest you start by identifying that line of code. It is not obvious which line is causing the seg fault from your post.

I also usually recommend that people seeking debugging help post a short complete example that demonstrates the problem, not snippets.

ru.sencis · July 14, 2023, 2:32pm

The error occurs after accessing the 0 element of the array serialization_point[0] and displaying its contents on the screen as shown by the arrow in the screenshot under the spoiler (arrow in line 313). But note that before that I already accessed this array element in the cuda main function and this did not cause an error (output arr_g value in screenshot). I believe that the allocated memory can be somehow reserved for the GPU and can only be accessed using in the cuda function and I did not find a way to allocate memory in the cuda main function so that it would be available on the CPU?

I apologize for the pieces of code, but that’s all that is happening at the moment with this variable, I did not publish the rest of the fragments, so as not to waste your time reading for the sake of declaring a variable in the code, for example.

Robert_Crovella · July 14, 2023, 3:31pm

I guess the first piece of code you have shown is from the function deproject_depth_cuda. And I guess the serialization_point in the function call is the same as serialization_point in the function body there.

So just guessing here based on that conjecture.

It looks like you are making a mistake around C/C++ pass-by-value to a function call. You cannot allocate a variable in the function the way you are doing it. Yes, that works inside the function, but not at the calling scope. This is a common C/C++ programming error representing a lack of understanding of pass-by-value characteristics, and it has nothing to do with CUDA.

There are numerous questions like this on various forums. here is an example.

ru.sencis · July 14, 2023, 4:46pm

Yes it is, thanks for taking the time! I also found another example with a similar situation and an explanation with a simple example.

system · July 28, 2023, 4:46pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
cudaMalloc causes segmentation fault 2 Mo is far from my 1,2 Go card memory limit CUDA Programming and Performance	7	7470	June 28, 2011
using cudaMalloc and cudaFree within a loop unspecified launch failure! CUDA Programming and Performance	21	37702	April 23, 2009
cudaMalloc error in big loop CUDA Programming and Performance	12	15613	May 21, 2008
Trouble allocating device memory for a struct CUDA Programming and Performance cuda	8	588	March 8, 2022
Allocating device memory for an struc inside an std::vector<struct> CUDA Programming and Performance	2	49	September 28, 2024
How to track down a Segmentation Fault in Big Programs CUDA Programming and Performance	5	1834	January 26, 2011
cudaHostAlloc can only allocate about 3.5GB of memory out of 128GB CUDA Programming and Performance	7	456	June 2, 2023
Segmentation fault when calling virtual function on host CUDA Programming and Performance	9	2481	September 10, 2019
segmentation fault at the first cudaMalloc with --device-emulation everything was fine CUDA Programming and Performance	10	4324	January 25, 2010
memory allocated by cannot be accessed by parent function pinned memory, mapped host memory, no-zer CUDA Programming and Performance	2	4767	May 20, 2011

How to allocate memory for host in cuda main function?

Related topics