I am trying to compare the performance of texture fetch and usual memory fetch. I dont know where i am going wrong, The texture fetch is 0.02ms slower than normal memory fetches, but texture fetches are supposed to be faster than global memory access. Could you please help with this
[codebox]
#include<stdio.h>
#include<cuda.h>
texture<int, 1, cudaReadModeElementType> texref;
global void fetchfromtexture(int n, int *a)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx < n)
{
int x = tex1D(texref, idx);
a[idx] = x;
}
}
global void fetchfromram(int n, int *a_d, int *a_d2)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx < n)
a_d2[idx]=a_d[idx];
}
main()
{
int n, *a_h, *a_h2, *a_d, *a_d2, i;
float time, time2;
cudaEvent_t start,stop,start2,stop2;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventCreate(&start2);
cudaEventCreate(&stop2);
printf("Enter array size: ");
scanf(“%d”,&n);
size_t size = sizeof(int)*n;
a_h=(int *)malloc(size);
a_h2=(int *)malloc(size);
for(i=0;i<n;i++)
a_h[i]=i*2;
printf(“\n the array is”);
for(i=0;i<n;i++)
printf("%d ", a_h[i]);
cudaMalloc((void**)&a_d,size);
cudaMalloc((void**)&a_d2,size);
cudaMemset(a_d, 0, size);
cudaMemset(a_d2, 0, size);
cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
int nb=n/32+((n%32==0)?0:1);
cudaArray* cuArray;
cudaChannelFormatDesc channelDesc=cudaCreateChannelDesc(32,0,0,0,cudaChannelFormat
KindUnsigned);
cudaMallocArray(&cuArray, &channelDesc, n, 1);
cudaMemcpyToArray(cuArray, 0, 0, a_h, size, cudaMemcpyHostToDevice);
texref.filterMode = cudaFilterModePoint;
texref.normalized = false;
texref.addressMode[0] = cudaAddressModeWrap;
texref.addressMode[1] = cudaAddressModeWrap;
cudaBindTextureToArray(texref, cuArray, channelDesc);
cudaEventRecord(start,0);
fetchfromtexture<<<nb,32>>>(n,a_d);
cudaThreadSynchronize();
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop);
cudaEventRecord(start,0);
fetchfromram<<<nb,32>>>(n,a_d,a_d2);
cudaThreadSynchronize();
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time2,start,stop);
printf("\n\n\nThe values fetched from texture is: ");
cudaMemcpy(a_h2, a_d, size, cudaMemcpyDeviceToHost);
for(i=0;i<n;i++)
printf(" %d ", a_h2[i]);
printf(“\nAnd time taken was %.2fms”, time);
for(i=0;i<n;i++)
a_h2[i]=0;
printf("\n\n\nThe values fetched from global memory is: ");
cudaMemcpy(a_h2, a_d2, size, cudaMemcpyDeviceToHost);
for(i=0;i<n;i++)
printf(" %d ", a_h2[i]);
printf(“\nAnd time taken was %.2fms\n”, time2);
cudaFreeArray(cuArray);
cudaUnbindTexture(texref);
return 0;
}
[/codebox]
i coded with reference to the example program given in programming guide