Hi there,
I am having some trouble using the texture cache on the GPU.
My situation is this: I have one square matrix of 16 unsigned chars (4x4) and many threads. Each thread needs to operate on a single element of this matrix together with other kinds of data. In other words, I have a (possibly large) number of blocks, and each block has 4x4x32 threads (these are the three dimensions of a single block: x=4, y=4 and z=32). Assuming that the matrix in device memory is called “d_data”, each thread needs to operate on the unsigned char at position threadIdx.x + blockDim.x * threadIdx.y, as in the code below:
[codebox]#include <stdio.h>
#include <cutil_inline.h>
// Number of timed launches that are averaged for each variant.
#define ROUNDS 100 // iterations to average

// 1-D texture reference over unsigned char elements, read back unconverted
// (cudaReadModeElementType). It is bound to the device matrix in main().
texture <unsigned char, 1, cudaReadModeElementType> tex;

// Launch geometry shared by both kernels: 4 x 4 x 32 = 512 threads per block,
// 2000 blocks in a one-dimensional grid.
dim3 block = dim3(4,4,32);
dim3 grid = dim3(2000);

// The __global__ qualifier is required for functions launched with <<<...>>>.
__global__ void kernel(); // kernel that uses texture cache
__global__ void kernelStd(unsigned char *d_data); // kernel with "standard" (i.e. no texture cache) data copy
// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t err, const char *what) {
    if (err == cudaSuccess) return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Benchmarks kernelStd (plain global-memory read) against kernel (read through
// the texture reference `tex`), averaging ROUNDS timed launches of each.
//
// Only the launch plus the following synchronisation is timed. The texture is
// bound once before the texture loop and unbound after it, so both variants
// are measured over the same kind of work.
//
// Returns 0 on success and 1 if a host allocation or any CUDA call failed.
int main(int argc, char **argv) {
    const size_t kBytes = 16 * sizeof(unsigned char);

    unsigned char *data = (unsigned char *) malloc(kBytes);
    if (data == NULL) {
        fprintf(stderr, "malloc of host matrix failed\n");
        return 1;
    }
    for (unsigned char i = 0; i < 16; i++) data[i] = 0xff - i; // init of data

    unsigned char *d_data = NULL;
    bool ok = checkCuda(cudaMalloc((void **) &d_data, kBytes), "cudaMalloc") &&
              checkCuda(cudaMemcpy(d_data, data, kBytes, cudaMemcpyHostToDevice), "cudaMemcpy");

    double t_avg = 0;
    unsigned int timer;

    // measuring computation time with texture cache "disabled"
    for (unsigned int round = 0; ok && round < ROUNDS; round++) {
        cutCreateTimer(&timer);
        cutStartTimer(timer);
        kernelStd<<<grid,block>>>(d_data);
        cudaError_t syncErr = cudaThreadSynchronize();
        cutStopTimer(timer);
        t_avg += cutGetTimerValue(timer) / (double) ROUNDS;
        cutDeleteTimer(timer);
        // Errors are inspected after the timer has stopped so that the
        // checks themselves are not part of the measurement.
        ok = checkCuda(syncErr, "kernelStd (synchronize)") &&
             checkCuda(cudaGetLastError(), "kernelStd (launch)");
    }
    if (ok) printf("%d rounds without texture cache. Average time: %f ms\n", ROUNDS, t_avg);

    t_avg = 0;

    // Bind once, outside the timed region: binding and unbinding are setup
    // work, not part of the kernel being measured.
    bool bound = ok && checkCuda(cudaBindTexture(0, tex, d_data, kBytes), "cudaBindTexture");
    ok = ok && bound;

    // measuring computation time using texture cache
    for (unsigned int round = 0; ok && round < ROUNDS; round++) {
        cutCreateTimer(&timer);
        cutStartTimer(timer);
        kernel<<<grid,block>>>();
        cudaError_t syncErr = cudaThreadSynchronize();
        cutStopTimer(timer);
        t_avg += cutGetTimerValue(timer) / (double) ROUNDS;
        cutDeleteTimer(timer);
        ok = checkCuda(syncErr, "kernel (synchronize)") &&
             checkCuda(cudaGetLastError(), "kernel (launch)");
    }
    if (ok) printf("%d rounds using texture cache. Average time: %f ms\n", ROUNDS, t_avg);

    // Always release what was acquired, even after a failure.
    if (bound) cudaUnbindTexture(tex);
    if (d_data != NULL) cudaFree(d_data);
    free(data);
    return ok ? 0 : 1;
}
// Texture-path variant: each thread fetches one unsigned char through the
// texture reference `tex` and increments its private copy.
//
// The fetch index is threadIdx.x + blockDim.x * threadIdx.y; threadIdx.z and
// blockIdx do not take part, so threads that differ only in z or in block
// read the same element. `tex` must be bound before this kernel is launched.
//
// NOTE(review): the incremented value is never written anywhere, so the
// compiler may remove the fetch as dead code -- confirm against the generated
// PTX before trusting timings taken from this kernel.
__global__ void kernel() {
    unsigned char myData = tex1Dfetch(tex, threadIdx.x + blockDim.x * threadIdx.y);
    myData += 1;
}
// Baseline variant: each thread reads one unsigned char directly from
// `d_data` and increments its private copy.
//
// d_data: pointer that the kernel indexes with
//         threadIdx.x + blockDim.x * threadIdx.y; threadIdx.z and blockIdx do
//         not take part, so threads that differ only in z or in block read
//         the same element. The buffer must hold at least
//         blockDim.x * blockDim.y elements.
//
// NOTE(review): the incremented value is never written anywhere, so the
// compiler may remove the load as dead code -- confirm against the generated
// PTX before trusting timings taken from this kernel.
__global__ void kernelStd(unsigned char *d_data) {
    unsigned char myData = d_data[threadIdx.x + blockDim.x * threadIdx.y];
    myData += 1;
}
[/codebox]
thanks for your answers
M.M.