Hi all,
I am trying to write a simple down-sampling kernel using CUDA. Most of the examples I could find online use texture references, while this article suggests that a more modern approach would be to use texture objects.
I wrote a simple program, but somehow the texture fetch always returns 0. I must be doing something wrong. I checked my normalized coordinates in the kernel (u
/v
) and they all seem reasonable (between 0.0 and 1.0). The error must be related to my texture object I imagine.
Most examples I could find online (including in the NVIDIA programming guide) use cudaMemcpyToArray
, but this function has recently been deprecated. I am using cudaMemcpy2DToArray
here instead.
If someone could let me know what I messed up in the code below, I would greatly appreciate it!
#include <iostream>
#include <vector>
__global__
void downsample(unsigned char* output, cudaTextureObject_t tex, std::size_t width_out, std::size_t height_out) {
int i = blockIdx.x *blockDim.x + threadIdx.x;
int j = blockIdx.y *blockDim.y + threadIdx.y;
if(i >= width_out && j >= height_out)
return;
float u = (i + 0.5f) / width_out;
float v = (j + 0.5f) / height_out;
uchar4 pixel = tex2D<uchar4>(tex, u, v); // <-- pixel = {0, 0, 0, 0}
int globId = (j*width_out + i)*4;
output[globId ] = pixel.x;
output[globId+1] = pixel.y;
output[globId+2] = pixel.z;
output[globId+3] = pixel.w;
}
int main() {
std::size_t width = 256;
std::size_t height = 128;
std::size_t n_channels = 4;
std::vector<unsigned char> image(width*height*n_channels);
for(std::size_t j = 0; j < height; ++j) {
for(std::size_t i = 0; i < width; ++i) {
image[4*(j*width+i)] = 255*j/height;
image[4*(j*width+i)+1] = 255*i/width;
image[4*(j*width+i)+2] = 55;
image[4*(j*width+i)+3] = 255;
}
}
auto channelDesc = cudaCreateChannelDesc<uchar4>();
cudaArray* cuArray;
cudaMallocArray(&cuArray, &channelDesc, width, height);
cudaMemcpy2DToArray(cuArray, 0, 0, image.data(), width*n_channels, width*n_channels, height, cudaMemcpyHostToDevice);
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeBorder;
texDesc.addressMode[1] = cudaAddressModeBorder;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 1;
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
std::size_t width_out = 4;
std::size_t height_out = 4;
unsigned char* output;
cudaMalloc(&output, width_out * height_out * n_channels);
dim3 dimBlock(4, 4);
dim3 dimGrid(1, 1);
downsample<<<dimGrid, dimBlock>>>(output, texObj, width_out, height_out);
cudaDestroyTextureObject(texObj);
cudaFreeArray(cuArray);
std::vector<unsigned char> image_out(width_out*height_out*n_channels);
cudaMemcpy(image_out.data(), output, width_out*height_out*n_channels, cudaMemcpyDeviceToHost);
// do stuff with image_out
cudaFree(output);
return 0;
}
Thanks!