Downsampling an image with CUDA

Hi all,

I am trying to write a simple down-sampling kernel using CUDA. Most of the examples I could find online use texture references, while this article suggests that a more modern approach would be to use texture objects.

I wrote a simple program, but somehow the texture fetch always returns 0. I must be doing something wrong. I checked my normalized coordinates in the kernel (u/v) and they all seem reasonable (between 0.0 and 1.0). The error must be related to my texture object I imagine.

Most examples I could find online (including in the NVIDIA programming guide) use cudaMemcpyToArray, but this function has recently been deprecated. I am using cudaMemcpy2DToArray here instead.

If someone could let me know what I messed up in the code below, I would greatly appreciate it!

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

// Downsample via texture sampling: each thread writes one RGBA8 pixel of the
// width_out x height_out destination image, sampling the source texture at the
// destination texel center in normalized coordinates.
// Expected launch: 2D grid covering at least width_out x height_out threads.
__global__ 
void downsample(unsigned char* output, cudaTextureObject_t tex, std::size_t width_out, std::size_t height_out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;

  // Guard the grid tail. This must be `||`, not `&&`: a thread is out of
  // range when EITHER coordinate overruns. With `&&`, a thread with
  // i >= width_out but j < height_out (or vice versa) falls through and
  // writes out of bounds.
  if (i >= width_out || j >= height_out)
    return;

  // Normalized coordinates (0..1) at the center of destination texel (i, j).
  float u = (i + 0.5f) / width_out;
  float v = (j + 0.5f) / height_out;

  // NOTE(review): fetching uchar4 (cudaReadModeElementType) requires
  // cudaFilterModePoint on the texture object — linear filtering is only
  // defined for float-returning reads.
  uchar4 pixel = tex2D<uchar4>(tex, u, v);

  int globId = (j * width_out + i) * 4;
  output[globId    ] = pixel.x;
  output[globId + 1] = pixel.y;
  output[globId + 2] = pixel.z;
  output[globId + 3] = pixel.w;
}

// Check every CUDA API call. Kernel launches and texture-object creation fail
// silently without this — exactly how the "fetch always returns 0" symptom
// went undiagnosed in the original version.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " \
                << cudaGetErrorString(err_) << std::endl;                  \
      std::exit(EXIT_FAILURE);                                             \
    }                                                                      \
  } while (0)

int main() {
  std::size_t width = 256;
  std::size_t height = 128;
  std::size_t n_channels = 4;

  // Build a synthetic RGBA8 test image: red ramps with row, green with column.
  std::vector<unsigned char> image(width*height*n_channels);
  for(std::size_t j = 0; j < height; ++j) {
    for(std::size_t i = 0; i < width; ++i) {
      image[4*(j*width+i)] = 255*j/height;
      image[4*(j*width+i)+1] = 255*i/width;
      image[4*(j*width+i)+2] = 55;
      image[4*(j*width+i)+3] = 255;
    }
  }

  // Upload the image into a CUDA array (cudaMemcpy2DToArray replaces the
  // deprecated cudaMemcpyToArray; source pitch == row width in bytes here
  // since the host buffer is tightly packed).
  auto channelDesc = cudaCreateChannelDesc<uchar4>();
  cudaArray* cuArray;
  CUDA_CHECK(cudaMallocArray(&cuArray, &channelDesc, width, height));
  CUDA_CHECK(cudaMemcpy2DToArray(cuArray, 0, 0, image.data(), width*n_channels, width*n_channels, height, cudaMemcpyHostToDevice));

  struct cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeArray;
  resDesc.res.array.array = cuArray;

  struct cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0]   = cudaAddressModeBorder;
  texDesc.addressMode[1]   = cudaAddressModeBorder;
  // Point filtering: linear filtering is only supported for texture reads
  // that return floating-point data (cudaReadModeNormalizedFloat). With
  // cudaReadModeElementType on uchar4 it is invalid — the root cause of the
  // original all-zero fetches.
  texDesc.filterMode       = cudaFilterModePoint;
  texDesc.readMode         = cudaReadModeElementType;
  texDesc.normalizedCoords = 1;

  cudaTextureObject_t texObj = 0;
  CUDA_CHECK(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));

  std::size_t width_out = 4;
  std::size_t height_out = 4;
  unsigned char* output;
  CUDA_CHECK(cudaMalloc(&output, width_out * height_out * n_channels));

  // Ceil-divide so any output size is covered; the kernel bounds-checks the tail.
  dim3 dimBlock(4, 4);
  dim3 dimGrid((width_out  + dimBlock.x - 1) / dimBlock.x,
               (height_out + dimBlock.y - 1) / dimBlock.y);
  downsample<<<dimGrid, dimBlock>>>(output, texObj, width_out, height_out);
  CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors

  // Blocking copy also synchronizes with the kernel on the default stream.
  std::vector<unsigned char> image_out(width_out*height_out*n_channels);
  CUDA_CHECK(cudaMemcpy(image_out.data(), output, width_out*height_out*n_channels, cudaMemcpyDeviceToHost));
  // do stuff with image_out

  // Release resources only after the kernel has finished using them. The
  // original destroyed the texture object and freed the array immediately
  // after the asynchronous launch, while the kernel could still be running.
  CUDA_CHECK(cudaDestroyTextureObject(texObj));
  CUDA_CHECK(cudaFreeArray(cuArray));
  CUDA_CHECK(cudaFree(output));

  return 0;
}

Thanks!

I just needed to add some error checking to realize that texture object creation was failing: cudaFilterModeLinear is only supported when the texture read returns floating-point data (cudaReadModeNormalizedFloat) — it cannot be combined with cudaReadModeElementType on an integer format like uchar4, on any GPU. Switching to cudaFilterModePoint works just fine (alternatively, keep linear filtering and read the texture as normalized floats).