Why are texture memory reads slower than global reads even though it is being accessed spatially?

srinath2468 · June 19, 2020, 11:01am

I had created a test project to compare texture reads vs global reads on a 640*480 image. I use a 3x3 convolution kernel and add all the neighbors to the corresponding pixel and create an output array.

Since texture memory speeds up spatial data reads, I thought i would be getting a high performance gain but rather it is slower than the normal global reads.

These are the results:
(2d refers to texture and normal refers to the global read approach)

946.51us transformKernel2D(int*, unsigned __int64)
599.75us transformKernelNormal(int*, unsigned char*)

This was surprising to me. My code is written below

// Texture approach
__global__ void transformKernel2D(int *output, cudaTextureObject_t texObj) {

  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (tid > 307200)
    return;

  // X and Y coordinates
  int Iy = floorf(tid / 640); // Y index of Matrix
  int Ix = tid % 640;         // X index of Matrix

  if (Ix < 50 || Ix > 600 || Iy < 50 || Iy > 400)
    return;

  int test= 0;
  
  for (int i = -1; i < 2; i++) {
    for (int j = -1; j < 2; j++) {
      if (j == 0 && i == 0)
        continue;

      // Read from texture 
      unsigned char temp1 = tex2D<unsigned char>(texObj, Ix + i, Iy + j);
      test+= (int)temp1;

    }
  }

  output[tid] = test;
}


int width = 480;
  int height = 640;

  byte *d_buffer;
  size_t pitch;
  cudaMallocPitch(&d_buffer, &pitch, sizeof(byte) * width, height);

 
  // Copy to device memory some data located at address h_data
  // in host memory

  byte *h_data;
  int *h_final;
  cudaMallocHost((void **)&h_data, sizeof(byte) * 307200);
  cudaMallocHost((void **)&h_final, sizeof(int) * 307200);
  for (int i = 0; i < 640 * 480; i++) {

    h_data[i] = 1;
    h_final[i] = 0;
  }

  cudaMemcpy2D(d_buffer, pitch, h_data, sizeof(byte) * width,
               sizeof(byte) * width, height, cudaMemcpyHostToDevice);

  struct cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypePitch2D;
  resDesc.res.pitch2D.devPtr = d_buffer;
  resDesc.res.pitch2D.pitchInBytes = pitch;
  resDesc.res.pitch2D.width = width;
  resDesc.res.pitch2D.height = height;
  resDesc.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();

  struct cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0] = cudaAddressModeBorder;
  texDesc.addressMode[1] = cudaAddressModeBorder;
  texDesc.filterMode = cudaFilterModePoint;
  texDesc.readMode = cudaReadModeElementType;
  texDesc.normalizedCoords = 0;

  cudaTextureObject_t tex;
  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);

  // Allocate result of transformation in device memory
  int *output;
  cudaMalloc(&output, sizeof(int) * 307200);

  // Invoke kernel

  int blocks = ceil(307200 / 128.0);
  transformKernel2D<<<blocks, 128>>>(output, tex);

/ Simple transformation kernel
__global__ void transformKernelNormal(int *output, unsigned char *image) {

  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (tid > 307200)
    return;

  // X and Y coordinates
  int Iy = floorf(tid / 640); // Y index of Matrix
  int Ix = tid % 640;         // X index of Matrix

  if (Ix < 50 || Ix > 600 || Iy < 50 || Iy > 400)
    return;

  // // finding Sobel and covariance

  int test = 0;
  int index = 0;

  for (int i = -1; i < 2; i++) {
    for (int j = -1; j < 2; j++) {
      if (j == 0 && i == 0)
        continue;

      int neighbourId = ((Iy + j) * 640) + (Ix + i);
      test += image[neighbourId];
     }
  }

 
  output[tid] = test;
}


size_t size = 307200 * sizeof(byte);
  // Allocate CUDA array in device memory
  byte *d_image;
  cudaMalloc((void **)&d_image, size);

  cudaMemcpy(d_image, h_data, size, cudaMemcpyHostToDevice);

  // Allocate result of transformation in device memory
  int *output3;
  cudaMalloc((void **)&output3, sizeof(int) * 307200);

  // Invoke kernel

  transformKernelNormal<<<blocks, 128>>>(output3, d_image);

Is there something wrong with the kernel? I feel like texture should give me better performaces