Hi
Just to learn something new (I couldn't find the answer with the search function or Google): can anyone help explain these different access speeds?
We try to access many float4 (or double4) values at random locations in device memory. Compared to accessing the four floats (or doubles) separately, the kernel that reads them directly as float4/double4 is not much faster. When they are accessed as float4/double4 textures, the kernel takes only 33%/40% of the time, while with single float/double texture accesses it takes 70%/80%.
In a table:
access as relative kernel run time
float 100%
float4 ~100%
float/tex 70%
float4/tex 33%
Best regards
Ceearem
P.S. The doubles are accessed through int2 and int4 textures, respectively, like this:
#if X_PRECISION == 2
/*
 * Fetch texel `i` from an int2 texture and reassemble its two 32-bit
 * components into one double.
 *
 * The texel's `y` component is passed as the high word and its `x`
 * component as the low word of __hiloint2double.
 */
static __inline__ __device__ double tex1Dfetch_double(texture<int2, 1> t, int i)
{
    const int2 words = tex1Dfetch(t, i);
    const int hi = words.y;
    const int lo = words.x;
    return __hiloint2double(hi, lo);
}
/*
 * Fetch the four-component value with logical index `i` from an int4 texture.
 *
 * Each logical element spans two consecutive int4 texels (2*i and 2*i+1).
 * The first texel supplies the x and y components, the second supplies z
 * and w. Within a texel, each (even, odd) component pair is combined as
 * (low word, high word) by __hiloint2double.
 */
static __inline__ __device__ X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
    const int base = 2 * i;
    const int4 first = tex1Dfetch(t, base);
    const int4 second = tex1Dfetch(t, base + 1);

    X_FLOAT4 result;
    result.x = __hiloint2double(first.y, first.x);
    result.y = __hiloint2double(first.w, first.z);
    result.z = __hiloint2double(second.y, second.x);
    result.w = __hiloint2double(second.w, second.z);
    return result;
}
#endif