Unable to do image processing algorithms on GPU

I am having trouble with the operation below.

I am trying to convert RGB images to grayscale, and I want to do this operation on the GPU. I pass the kernel an array of pointers (one pointer per image), and I want the grayscale images to be stored in d_output_images.
However, it throws a runtime error.

// Grayscale conversion over a stack of images, one thread per (col, row) position.
// Each thread loops over all `stack_size` images and, for its pixel, reads three
// consecutive bytes (used as b, g, r) from d_input_images[index] and writes one
// weighted-sum byte to d_output_images[index].
// Both pointer tables are indexed inside the kernel, so the tables themselves (not
// only the image buffers they point to) must reside in device-accessible memory.
// NOTE(review): `height` is never used and there is no col/row range check, so the
// launch grid must not extend past the image — confirm against the caller.
__global__ void convert_gray_scale(int width, int height, int stack_size,
                                     unsigned char * d_input_images[],
                                     unsigned char * d_output_images[])
{
  // 2D global thread coordinates: x maps to the column, y to the row.
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  // Process the same pixel position in every image of the stack.
  for(int index=0;index < stack_size; index++)
  {
    // Input is interleaved, 3 bytes per pixel; offsets 0/1/2 are read as b/g/r.
    unsigned char b = d_input_images[index][((row * width + col) * 3)];
    unsigned char g = d_input_images[index][((row * width + col) * 3) + 1];
    unsigned char r = d_input_images[index][((row * width + col) * 3) + 2];

    // Weighted sum is computed in float and converted to unsigned char on assignment.
    d_output_images[index][row * width + col] = r * 0.299f + g * 0.587f + b * 0.114f; // issue is coming from this line

  }
}


// Host driver: allocates one device buffer per image, uploads the inputs, launches
// convert_gray_scale once for the whole stack, then downloads the results.
// NOTE(review): none of the CUDA runtime return codes are checked in this function.
int main()
{
    int number_of_images = 5;
    int width = 1912;
    int height = 1192; // Assume all the images are of this size & BGR
    unsigned int blockHeight = 16;
    unsigned int blockWidth = 16;

    // NOTE(review): h_input_images is never assigned in this snippet; it is only
    // read (in the cudaMemcpy below), so the images must be loaded elsewhere.
    unsigned char * h_input_images[number_of_images]; // Assume it has the input images
    unsigned char * h_output_images[number_of_images];
    // These two arrays are ordinary local (host) arrays; their elements are filled
    // with device pointers by cudaMalloc below.
    unsigned char * d_input_images[number_of_images];
    unsigned char * d_output_images[number_of_images];
    
  for(int idx=0;idx < number_of_images; idx++)
  {
    //allocate memory of host's grey data array
    h_output_images[idx] = (unsigned char *)malloc(sizeof(unsigned char)* width * height);

    // Allocating the memory in the Device Memory
    cudaMalloc(d_input_images + idx, sizeof(unsigned char) * 3 * width * height);
    cudaMalloc(d_output_images + idx, sizeof(unsigned char) * width * height);

    // Copy the data from cpu to GPU
    cudaMemcpy(*(d_input_images + idx), *(h_input_images + idx), sizeof(unsigned char) * 3 * width * height, cudaMemcpyHostToDevice);
  }
  
  // Allocates block size and grid size
   dim3 threadsPerBlock(blockWidth ,blockHeight);
   // NOTE(review): width/blockWidth and height/blockHeight are integer divisions,
   // so the quotient is already truncated before ceil() sees it.
   dim3 blocksPerGrid((int)ceil((width)/blockWidth), (int)ceil((height)/blockHeight));

   // NOTE(review): d_input_images / d_output_images decay to the addresses of the
   // host arrays declared above, and the kernel indexes through those addresses.
   convert_gray_scale<<<blocksPerGrid, threadsPerBlock>>>(width, height, number_of_images, d_input_images, d_output_images);

  // NOTE(review): `stack_size` is not declared in main; the other loops in this
  // function use number_of_images as the bound.
  for(int idx=0;idx < stack_size; idx++)
  {
    // Copy the data from gpu to cpu
    cudaMemcpy(*(h_output_images + idx), *(d_output_images + idx), sizeof(unsigned char) * width * height, cudaMemcpyDeviceToHost);
    // Release this image's device buffers once its result has been copied back.
    cudaFree(d_input_images[idx]);
    cudaFree(d_output_images[idx]);
  }

}

But it throws a runtime error: "CUDA error: an illegal memory access was encountered"
What am I doing wrong?

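The proximate problem is that d_input_images (and likewise d_output_images) is an array that lives in host memory: its elements are device pointers, but the array itself is on the host. Passing it to the kernel therefore passes a pointer to host memory, and you are attempting to dereference that pointer in device code. That is illegal in CUDA.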

This is a common problem that trips up people who are working with arrays of pointers, and 2D arrays, in all their various flavors. This question/answer covers the topic in more detail with suggestions and worked examples for addressing this issue.

For a variety of reasons, the usual suggestion I would give in a case like this is to use the flattening idea. Arrange your images not as an array of pointers to individual images, but rather by a single pointer that points to the start of the images in memory, where all images are stored contiguously, and you reference an individual image by pointer arithmetic from the base pointer.

Hi @Robert_Crovella, thanks for suggesting the above solutions. Could you give me an example of flattening the images? (I have tried it, but it's not working.) Moreover, how can we convert an array of image pointers into a single flattened image pointer?

Here is an example, and it shows how to convert an array of pointer images on the host to a single pointer flattened image on the device:

#include <cstdio>
#include <cstdlib>

// Aborts with file/line and the CUDA error string if a runtime call fails.
// Unchecked failures are sticky and make every later call fail confusingly.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Converts a stack of interleaved BGR images to 8-bit grayscale.
//
// Layout: all images are stored back to back in one flat buffer.
//   d_input_images  : stack_size * height * width * 3 bytes (B, G, R per pixel)
//   d_output_images : stack_size * height * width bytes
// Launch: 2D grid of 2D blocks covering at least width x height threads
// (x -> column, y -> row). Each thread converts its pixel in every image.
// The grid may be larger than the image; surplus threads exit immediately.
__global__ void convert_gray_scale(int width, int height, int stack_size,
                                     unsigned char * d_input_images,
                                     unsigned char * d_output_images)
{
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  // The grid is rounded up to whole blocks, so threads on the right/bottom
  // edge can fall outside the image and must not touch memory.
  if (col >= width || row >= height)
    return;

  // size_t arithmetic so large stacks cannot overflow a 32-bit index.
  const size_t pixels_per_image = (size_t)width * height;
  const size_t pixel_in_image = (size_t)row * width + col;

  for (int index = 0; index < stack_size; index++)
  {
    const size_t pixel = (size_t)index * pixels_per_image + pixel_in_image;
    const unsigned char * bgr = d_input_images + pixel * 3;

    unsigned char b = bgr[0];
    unsigned char g = bgr[1];
    unsigned char r = bgr[2];

    // Weighted luma sum; the float result is truncated to unsigned char.
    d_output_images[pixel] = r * 0.299f + g * 0.587f + b * 0.114f;
  }
}


// Host driver: packs per-image host buffers into one flat device buffer,
// converts the whole stack with a single launch, and unpacks the results
// back into per-image host buffers.
int main()
{
    // const so the array bounds below are constant expressions (no VLAs).
    const int number_of_images = 5;
    const int width = 1912;
    const int height = 1192; // Assume all the images are of this size & BGR
    const unsigned int blockHeight = 16;
    const unsigned int blockWidth = 16;

    const size_t gray_bytes = (size_t)width * height * sizeof(unsigned char);
    const size_t bgr_bytes = gray_bytes * 3;

    unsigned char * h_input_images[number_of_images];
    unsigned char * h_output_images[number_of_images];
    unsigned char * d_input_images = NULL;
    unsigned char * d_output_images = NULL;
    CUDA_CHECK(cudaMalloc(&d_input_images, bgr_bytes * number_of_images));
    CUDA_CHECK(cudaMalloc(&d_output_images, gray_bytes * number_of_images));

  for (int idx = 0; idx < number_of_images; idx++)
  {
    h_output_images[idx] = (unsigned char *)malloc(gray_bytes);
    // Placeholder allocation: a real program would load image idx here.
    h_input_images[idx] = (unsigned char *)malloc(bgr_bytes);
    if (h_output_images[idx] == NULL || h_input_images[idx] == NULL)
    {
      fprintf(stderr, "Host allocation failed for image %d\n", idx);
      return EXIT_FAILURE;
    }

    // Copy image idx into its slot of the flat device buffer.
    CUDA_CHECK(cudaMemcpy(d_input_images + idx * bgr_bytes, h_input_images[idx],
                          bgr_bytes, cudaMemcpyHostToDevice));
  }

  // Block size, and a grid rounded UP so partial edge tiles are covered
  // (1912 and 1192 are not multiples of 16; the kernel guards the overhang).
   dim3 threadsPerBlock(blockWidth, blockHeight);
   dim3 blocksPerGrid((width + blockWidth - 1) / blockWidth,
                      (height + blockHeight - 1) / blockHeight);

   convert_gray_scale<<<blocksPerGrid, threadsPerBlock>>>(width, height, number_of_images, d_input_images, d_output_images);
   CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
   CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

  for (int idx = 0; idx < number_of_images; idx++)
  {
    // Copy image idx from its slot of the flat device buffer back to the host.
    CUDA_CHECK(cudaMemcpy(h_output_images[idx], d_output_images + idx * gray_bytes,
                          gray_bytes, cudaMemcpyDeviceToHost));
  }
  CUDA_CHECK(cudaFree(d_input_images));
  CUDA_CHECK(cudaFree(d_output_images));

  for (int idx = 0; idx < number_of_images; idx++)
  {
    free(h_input_images[idx]);
    free(h_output_images[idx]);
  }

  return 0;
}
1 Like

Thanks @Robert_Crovella

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.