Downsampling an image with CUDA

Hi all,

I am trying to write a simple down-sampling kernel using CUDA. Most of the examples I could find online use texture references, while this article suggests that a more modern approach would be to use texture objects.

I wrote a simple program, but somehow the texture fetch always returns 0. I must be doing something wrong. I checked my normalized coordinates in the kernel (u/v) and they all seem reasonable (between 0.0 and 1.0). The error must be related to my texture object I imagine.

Most examples I could find online (including in the NVIDIA programming guide) use cudaMemcpyToArray, but this function has recently been deprecated. I am using cudaMemcpy2DToArray here instead.

If someone could let me know what I messed up in the code below, I would greatly appreciate it!

#include <iostream>
#include <vector>

__global__ 
void downsample(unsigned char* output, cudaTextureObject_t tex, std::size_t width_out, std::size_t height_out) {
  int i = blockIdx.x *blockDim.x + threadIdx.x;
  int j = blockIdx.y *blockDim.y + threadIdx.y;

  if(i >= width_out && j >= height_out) 
    return;

  float u = (i + 0.5f) / width_out;
  float v = (j + 0.5f) / height_out;

  uchar4 pixel = tex2D<uchar4>(tex, u, v); // <-- pixel = {0, 0, 0, 0}

  int globId = (j*width_out + i)*4;
  output[globId  ] = pixel.x;
  output[globId+1] = pixel.y;
  output[globId+2] = pixel.z;
  output[globId+3] = pixel.w;
}

int main() {
  std::size_t width = 256;
  std::size_t height = 128;
  std::size_t n_channels = 4;

  std::vector<unsigned char> image(width*height*n_channels);
  for(std::size_t j = 0; j < height; ++j) {
    for(std::size_t i = 0; i < width; ++i) {
      image[4*(j*width+i)] = 255*j/height;
      image[4*(j*width+i)+1] = 255*i/width;
      image[4*(j*width+i)+2] = 55;
      image[4*(j*width+i)+3] = 255;
    }
  }

  auto channelDesc = cudaCreateChannelDesc<uchar4>();
  cudaArray* cuArray;
  cudaMallocArray(&cuArray, &channelDesc, width, height);
  cudaMemcpy2DToArray(cuArray, 0, 0, image.data(), width*n_channels, width*n_channels, height, cudaMemcpyHostToDevice);

  struct cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeArray;
  resDesc.res.array.array = cuArray;

  struct cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0]   = cudaAddressModeBorder;
  texDesc.addressMode[1]   = cudaAddressModeBorder;
  texDesc.filterMode       = cudaFilterModeLinear;
  texDesc.readMode         = cudaReadModeElementType;
  texDesc.normalizedCoords = 1;

  cudaTextureObject_t texObj = 0;
  cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

  std::size_t width_out = 4;
  std::size_t height_out = 4;
  unsigned char* output;
  cudaMalloc(&output, width_out * height_out * n_channels);
  dim3 dimBlock(4, 4);
  dim3 dimGrid(1, 1);
  downsample<<<dimGrid, dimBlock>>>(output, texObj, width_out, height_out);

  cudaDestroyTextureObject(texObj);
  cudaFreeArray(cuArray);

  std::vector<unsigned char> image_out(width_out*height_out*n_channels);
  cudaMemcpy(image_out.data(), output, width_out*height_out*n_channels, cudaMemcpyDeviceToHost);
  // do stuff with image_out

  cudaFree(output);

  return 0;
}

Thanks!

I just needed to add some error checking to realize that linear filtering is not available on my GPU. cudaFilterModePoint works just fine.