help with CUDA 3D-texture

[font=“Courier”]
3D texture buffer property :
CUDA_MEMCPY3D mem3D;
mem3D.dstMemoryType=CU_MEMORYTYPE_ARRAY;
mem3D.dstArray =devArray;
mem3D.dstLOD =0;
mem3D.srcMemoryType=CU_MEMORYTYPE_HOST;
mem3D.srcLOD =0;
mem3D.srcPitch =SIZE*sizeof(float4);
mem3D.srcHost =hPtr;
mem3D.WidthInBytes =mem3D.srcPitch;
mem3D.Height =SIZE;
mem3D.Depth =LAYS;
//mem3D.reserved0 =0;
//mem3D.reserved1 =0;
memset(&mem3D,0,sizeof(mem3D));
CU_SAFE_CALL(cuMemcpy3D(&mem3D));
and texref:
CU_SAFE_CALL(cuTexRefSetArray(texRef,devArray,CU_TRSA_OVERRIDE_FORMAT));
CU_SAFE_CALL(cuTexRefSetFormat(texRef,CU_AD_FORMAT_FLOAT,4));
CU_SAFE_CALL(cuTexRefSetAddressMode(texRef,0,CU_TR_ADDRESS_MODE_CLAMP));
CU_SAFE_CALL(cuTexRefSetAddressMode(texRef,1,CU_TR_ADDRESS_MODE_CLAMP));
CU_SAFE_CALL(cuTexRefSetAddressMode(texRef,2,CU_TR_ADDRESS_MODE_CLAMP));
CU_SAFE_CALL(cuTexRefSetFilterMode(texRef,CU_TR_FILTER_MODE_LINEAR));[/font]
but all result is zero <img src=‘http://hqnveipbwb20/public/style_emoticons/<#EMO_DIR#>/crying.gif’ class=‘bbc_emoticon’ alt=‘:’(’ />

this`s my kernel:

[b]
#if USE_CUDA_RT
#define CALLkernel global void
#else
#define CALLkernel extern “C” global void
#endif

#if USE_FAST_MATH
#define IMUL(X,Y) __umul24(X,Y)
#else
#define IMUL(X,Y) (X/Y)
#endif

#if USE_CONST_CACHE
constant float dt;
#endif

#define SIZE 128

texture<float4,3,cudaReadModeElementType> velfield;

CALLkernel kernel_fluid_advect(
float3* oBuffer
#if !USE_CONST_CACHE
,float dt
#endif
)
{
unsigned int gtidx=IMUL(blockDim.x,blockIdx.x)+threadIdx.x;
unsigned int gtidy=IMUL(blockDim.y,blockIdx.y)+threadIdx.y;
unsigned int layer=IMUL(gridDim.x,blockIdx.y>>3)+(blockIdx.x>>3);
unsigned int sizex=IMUL(gridDim.x,blockDim.x);
float3 loc;
loc.x=(float)(gtidx&(SIZE-1))+0.5f;
loc.y=(float)(gtidy&(SIZE-1))+0.5f;
loc.z=(float)layer+0.5f;
float4 vel;
vel=tex3D(velfield,loc.x,loc.y,loc.z);
loc.x-=dtvel.x;
loc.y-=dt
vel.y;
loc.z-=dt*vel.z;
vel=tex3D(velfield,loc.x,loc.y,loc.z);
unsigned int index=IMUL(gtidy,sizex)+gtidx;
oBuffer[index].x=vel.x;
oBuffer[index].y=vel.y;
oBuffer[index].z=vel.z;
}[/b]

I used texture size is:128x128x64
and the blockShape is BLOCK_DIMxBLOCK_DIM (is 16)
gridDim is GRID_SIZE/BLOCK_DIM x GRID_SIZE/BLOCK_DIM
GRID_SIZE:1024