My test code (Windows):
!!d.cu
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
// Number of output elements each kernel produces; main() also passes it as the
// stride between the four inputs summed for one output.
#define N 2097152
// Number of N-sized segments per buffer: every host/device buffer holds N*M ints.
#define M 4
// Threads per block. Used both in the launch configuration and inside the kernels
// to compute the global element index, so the two must stay in agreement.
#define nth 256
// Argument bundle handed by value to sum3_v0 (one struct instead of three
// separate kernel parameters).
struct str0
{
    int *src;     // buffer the kernel writes its per-element result into
    int n;        // number of elements to process; threads with index >= n exit
    int stride;   // distance, in elements, between the four inputs summed per output
};
// File-scope 1D texture reference over int elements. main() binds it to the
// device buffer db with cudaBindTexture, and sum3_v0 reads it via tex1Dfetch.
// NOTE(review): texture references are a legacy API (removed in CUDA 12) --
// confirm the toolkit version before building.
texture<int,1> texsrc;
// Texture-path variant of the benchmark kernel.
//
// Each thread handles one element index (block index * nth + thread index) and
// stores into p.src[index] the sum of four texsrc fetches taken at
// index, index + stride, index + 2*stride and index + 3*stride.
// Threads whose index is >= p.n do nothing.
//
// Expects a 1D launch with nth threads per block, and texsrc to be bound
// before the launch.
//
// Other access patterns the author experimented with here (not active):
//   - in-place sum of four adjacent ints through global memory (src[id*4] += ...)
//   - a scattered copy, src[id^1234] = src[id+stride]
//   - texture fetches of four adjacent elements (id*4 .. id*4+3)
__global__ static void sum3_v0(str0 p)
{
    const int idx = blockIdx.x * nth + threadIdx.x;
    if (idx < p.n) {
        // Start from the un-strided element, then add the three strided ones
        // in the same left-to-right order as a plain a+b+c+d expression.
        int total = tex1Dfetch(texsrc, idx);
        for (int k = 1; k <= 3; ++k) {
            total += tex1Dfetch(texsrc, idx + p.stride * k);
        }
        p.src[idx] = total;
    }
}
// Plain global-memory variant of the benchmark kernel.
//
// Each thread handles one element index (block index * nth + thread index) and
// replaces src[index] with
//   src[index] + src[index+stride] + src[index+2*stride] + src[index+3*stride].
// Threads whose index is >= n do nothing.
//
// Expects a 1D launch with nth threads per block. The update is in place, so
// src must hold at least index + 3*stride + 1 elements for every index < n.
//
// Other access patterns the author experimented with here (not active):
//   - the strided texture-fetch form used by sum3_v0
//   - texture fetches of four adjacent elements (id*4 .. id*4+3)
//   - accumulating into src[id+stride] with a per-block read of src[bid]
//   - a scattered read, src[id] = src[(id^1234)+stride]
__global__ static void sum3_v1(int *src,int n,int stride)
{
    const int idx = blockIdx.x * nth + threadIdx.x;
    if (idx >= n) {
        return;
    }

    // Load the four operands first, then combine and store.
    const int v0 = src[idx];
    const int v1 = src[idx + stride];
    const int v2 = src[idx + stride * 2];
    const int v3 = src[idx + stride * 3];
    src[idx] = v0 + v1 + v2 + v3;
}
// Abort with a readable message when a CUDA runtime call (or a kernel launch,
// via cudaGetLastError) reports an error. `what` names the failing step.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver: times `tryn` launches of sum3_v0 (texture fetches) against
// `tryn` launches of sum3_v1 (plain global loads) and prints the average time
// per launch in milliseconds.
//
// Kernel launches are asynchronous, so the device is synchronized before every
// timestamp; without that the first figure mostly measures launch overhead and
// the second one absorbs the unfinished work of the first loop.
//
// Returns 0 on success, 1 if host allocation fails; exits with EXIT_FAILURE on
// any CUDA error.
int main()
{
    const size_t bytes = sizeof(int) * N * M;   // size of every buffer, in bytes
    const int blocks = (N + nth - 1) / nth;     // ceil-div: one thread per element
    const int tryn = 100;                       // launches per timed variant

    int *a = (int*)malloc(bytes);
    int *b = (int*)malloc(bytes);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(a);
        free(b);
        return 1;
    }

    int *da = NULL;
    int *db = NULL;
    checkCuda(cudaMalloc((void**)&da, bytes), "cudaMalloc(da)");
    checkCuda(cudaMalloc((void**)&db, bytes), "cudaMalloc(db)");

    for (int i = 0; i < M * N; i++) {
        a[i] = i / M;
        b[i] = i % N;
    }

    checkCuda(cudaMemcpy(da, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(da)");
    checkCuda(cudaMemcpy(db, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(db)");
    // The texture size argument is in bytes, i.e. the whole db buffer.
    checkCuda(cudaBindTexture(0, texsrc, db, bytes), "cudaBindTexture");

    str0 s;
    s.src = da;
    s.n = N;
    s.stride = N;

    // Make sure nothing is still in flight before the clock starts.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize(setup)");

    // GetTickCount() returns a DWORD with coarse resolution (typically
    // 10-16 ms), hence the total over `tryn` launches is measured. Unsigned
    // subtraction stays correct across a counter wrap-around.
    DWORD t0 = GetTickCount();
    for (int i = 0; i < tryn; i++)
        sum3_v0<<<blocks, nth, 0>>>(s);
    checkCuda(cudaGetLastError(), "sum3_v0 launch");
    checkCuda(cudaDeviceSynchronize(), "sum3_v0 execution");

    DWORD t1 = GetTickCount();
    for (int i = 0; i < tryn; i++)
        sum3_v1<<<blocks, nth, 0>>>(db, N, N);
    checkCuda(cudaGetLastError(), "sum3_v1 launch");
    checkCuda(cudaDeviceSynchronize(), "sum3_v1 execution");
    DWORD t2 = GetTickCount();

    printf("v0: %.6lfms\n", (double)(t1 - t0) / tryn);
    printf("v1: %.6lfms\n", (double)(t2 - t1) / tryn);

    checkCuda(cudaUnbindTexture(texsrc), "cudaUnbindTexture");
    checkCuda(cudaFree(db), "cudaFree(db)");
    checkCuda(cudaFree(da), "cudaFree(da)");
    free(b);
    free(a);
    return 0;
}
Now the performance has suddenly become the same for both versions on my machine!?
Maybe I used a poor access pattern last time.
I don't know whether my benchmark will be useful; it is better to test it yourself to be sure.