I created a test project to compare texture reads vs. global reads on a 640x480 image. I use a 3x3 convolution kernel that adds all the neighbors of each pixel and writes the results to an output array.
Since texture memory speeds up spatially local reads, I thought I would see a significant performance gain, but it is actually slower than the normal global-read approach.
These are the results:
(2d refers to texture and normal refers to the global read approach)
946.51us transformKernel2D(int*, unsigned __int64)
599.75us transformKernelNormal(int*, unsigned char*)
This was surprising to me. My code is written below
// Texture approach: 3x3 neighbor sum (center pixel excluded) over a 640x480
// single-channel image read through a 2D texture object.
// Launch: 1D grid, one thread per pixel (640*480 threads total).
// Preconditions: texObj wraps a 640-texel-wide by 480-row unsigned char
// pitch2D resource; output has room for 640*480 ints.
__global__ void transformKernel2D(int *output, cudaTextureObject_t texObj) {
  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (tid >= 640 * 480)  // was `> 307200`: off-by-one let tid == 307200 through
    return;
  // Recover 2D pixel coordinates from the flat thread id.
  // tid / 640 is already integer division; the original floorf() was a
  // redundant int->float->int round trip.
  int Iy = tid / 640;  // row
  int Ix = tid % 640;  // column
  // Process only the interior region of interest. Zero the skipped pixels so
  // the output buffer is fully defined (cudaMalloc leaves it uninitialized).
  if (Ix < 50 || Ix > 600 || Iy < 50 || Iy > 400) {
    output[tid] = 0;
    return;
  }
  int sum = 0;
  for (int i = -1; i < 2; i++) {
    for (int j = -1; j < 2; j++) {
      if (j == 0 && i == 0)  // skip the center pixel itself
        continue;
      // Unnormalized coords + point filtering => direct texel fetch; the
      // border address mode returns 0 outside the image (interior guard
      // above keeps all fetches in range anyway).
      unsigned char texel = tex2D<unsigned char>(texObj, Ix + i, Iy + j);
      sum += (int)texel;
    }
  }
  output[tid] = sum;
}
// --- Host setup for the texture-based kernel --------------------------------
// BUG FIX: width and height were swapped (480x640). Both kernels index the
// image as 640 columns x 480 rows (Ix = tid % 640, Iy = tid / 640), so the
// pitched allocation and the texture must be 640 texels wide by 480 rows.
// With the old values, cudaAddressModeBorder returned 0 for every fetch at
// column >= 480, silently corrupting the texture kernel's results.
int width = 640;   // image width in texels (columns)
int height = 480;  // image height in rows
byte *d_buffer;
size_t pitch;
// Pitched allocation keeps each device row aligned for efficient 2D access.
cudaMallocPitch(&d_buffer, &pitch, sizeof(byte) * width, height);
// Copy to device memory some data located at address h_data in host memory.
byte *h_data;
int *h_final;
// Pinned host buffers (cudaMallocHost): faster transfers, required for async copies.
cudaMallocHost((void **)&h_data, sizeof(byte) * 640 * 480);
cudaMallocHost((void **)&h_final, sizeof(int) * 640 * 480);
for (int i = 0; i < 640 * 480; i++) {
  h_data[i] = 1;
  h_final[i] = 0;
}
// 2D copy honors the device pitch returned by cudaMallocPitch; the host
// buffer is tightly packed, so its pitch is just the row width in bytes.
cudaMemcpy2D(d_buffer, pitch, h_data, sizeof(byte) * width,
             sizeof(byte) * width, height, cudaMemcpyHostToDevice);
// Describe the pitched device buffer as a 2D texture resource.
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = d_buffer;
resDesc.res.pitch2D.pitchInBytes = pitch;
resDesc.res.pitch2D.width = width;   // NOTE: in elements, not bytes
resDesc.res.pitch2D.height = height;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();
// Point sampling with unnormalized integer coordinates; reads outside the
// image return 0 (border mode).
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeBorder;
texDesc.addressMode[1] = cudaAddressModeBorder;
texDesc.filterMode = cudaFilterModePoint;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 0;
cudaTextureObject_t tex;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
// Allocate result of transformation in device memory.
int *output;
cudaMalloc(&output, sizeof(int) * 640 * 480);
// Invoke kernel: one thread per pixel, integer ceil-div for the grid size
// (avoids the double-precision ceil() round trip).
int blocks = (640 * 480 + 127) / 128;
transformKernel2D<<<blocks, 128>>>(output, tex);
// Simple transformation kernel
// Global-memory approach: same 3x3 neighbor sum (center excluded), reading
// the image directly from device global memory.
// Launch: 1D grid, one thread per pixel (640*480 threads total).
// Preconditions: image is a row-major 640x480 unsigned char buffer; output
// has room for 640*480 ints.
__global__ void transformKernelNormal(int *output, unsigned char *image) {
  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (tid >= 640 * 480)  // was `> 307200`: off-by-one let tid == 307200 through
    return;
  // tid / 640 is already integer division; the original floorf() was a
  // redundant int->float->int round trip.
  int Iy = tid / 640;  // row
  int Ix = tid % 640;  // column
  // Process only the interior region of interest. Zero the skipped pixels so
  // the output buffer is fully defined (cudaMalloc leaves it uninitialized).
  if (Ix < 50 || Ix > 600 || Iy < 50 || Iy > 400) {
    output[tid] = 0;
    return;
  }
  int sum = 0;  // (unused local `index` removed)
  for (int i = -1; i < 2; i++) {
    for (int j = -1; j < 2; j++) {
      if (j == 0 && i == 0)  // skip the center pixel itself
        continue;
      // Row-major neighbor index; the interior guard above keeps this in bounds.
      int neighbourId = ((Iy + j) * 640) + (Ix + i);
      sum += image[neighbourId];
    }
  }
  output[tid] = sum;
}
// --- Host setup for the plain global-memory kernel ---------------------------
// Image is staged in an ordinary (non-pitched) linear device buffer.
size_t imageBytes = 640 * 480 * sizeof(byte);
byte *d_image = NULL;
cudaMalloc((void **)&d_image, imageBytes);
cudaMemcpy(d_image, h_data, imageBytes, cudaMemcpyHostToDevice);
// Device buffer for the kernel's integer output.
int *output3 = NULL;
cudaMalloc((void **)&output3, 640 * 480 * sizeof(int));
// Launch with the same grid/block configuration as the texture variant.
transformKernelNormal<<<blocks, 128>>>(output3, d_image);
Is there something wrong with the kernel? I feel like the texture version should give me better performance.