Need Help with Global memory on device

Hi,

I have tested the following code, and the accesses to global memory on the device take a long time.

/* Number of rows of the table (outer index i in the kernel). */
#define DIMX 5000

/* Number of columns of the table (inner index j in the kernel). */
#define DIMY 20

/* Threads per block used for the kernel launch. */
#define BLOCK_SIZE 256

/* Statically allocated table in device global memory, DIMX * DIMY floats,
 * addressed as d_TAB[i * DIMY + j].  The terminating semicolon was missing
 * from the original declaration, which made the file fail to compile. */
__device__ float d_TAB[DIMX * DIMY];

/**
 * Accumulates 1 + fD1*fD2 + j*fD1 into every element (i, j) of the global
 * device table d_TAB, where element (i, j) lives at d_TAB[i * iDimy + j].
 *
 * Launch layout: a 1-D grid of 1-D blocks (only the .x components are used).
 * Any grid size is valid because the loop strides by the total thread count.
 *
 * The original version gave each thread a whole row, so neighbouring threads
 * wrote addresses iDimy floats apart.  Here the two nested loops are flattened
 * into a single loop over the linear index: on every iteration neighbouring
 * threads touch neighbouring floats, which lets the hardware coalesce the
 * accesses of a warp.  The memory layout of d_TAB and the value added to each
 * element are unchanged.
 *
 * @param iDimx number of rows to update
 * @param iDimy number of columns per row (row stride of the table)
 * @param fD1   first coefficient
 * @param fD2   second coefficient
 */
__global__ void fillTab_GPU(int iDimx, int iDimy, float fD1, float fD2)
{
    // Nothing to update for empty extents; this also protects the modulo below.
    if (iDimx <= 0 || iDimy <= 0)
        return;

    const int tid      = blockDim.x * blockIdx.x + threadIdx.x;
    const int THREAD_N = blockDim.x * gridDim.x;

    // Never write past the end of d_TAB, even if the caller passes extents
    // larger than the table was declared with.  The product is formed in
    // 64 bits so it cannot overflow before the comparison.
    const long long llWanted   = (long long)iDimx * (long long)iDimy;
    const long long llCapacity = (long long)DIMX * (long long)DIMY;
    const int iTotal = (int)(llWanted < llCapacity ? llWanted : llCapacity);

    // Loop-invariant part of the increment.
    const float fProd = fD1 * fD2;

    for (int iPos = tid; iPos < iTotal; iPos += THREAD_N)
    {
        // Column of this element; the row index is not needed for the value.
        const int j = iPos % iDimy;

        const float fMul = fProd + (float)j * fD1;

        d_TAB[iPos] += 1.0f + fMul;
    }
}

/**
 * Launches fillTab_GPU over the whole DIMX x DIMY table and copies the
 * resulting contents of the device symbol d_TAB into a host buffer.
 *
 * @return 0 on success, 1 if the host allocation, the kernel launch or the
 *         device-to-host copy failed.
 */
int main(void)
{
    const size_t nElems   = (size_t)DIMX * DIMY;
    const size_t tabBytes = nElems * sizeof(float);

    dim3 dimBlock(BLOCK_SIZE);
    // Integer ceil-division: enough blocks for one thread per row.
    dim3 dimGrid((DIMX + BLOCK_SIZE - 1) / BLOCK_SIZE);

    float *f_Tab = (float *)calloc(nElems, sizeof(float));
    if (!f_Tab)
        return 1;

    fillTab_GPU<<<dimGrid, dimBlock>>>(DIMX, DIMY, 2.0f, 1.5f);

    // A kernel launch returns nothing; configuration errors are only visible
    // through cudaGetLastError().
    cudaError_t err = cudaGetLastError();

    // d_TAB is a __device__ symbol, not a pointer obtained from cudaMalloc,
    // so it has to be read with cudaMemcpyFromSymbol rather than cudaMemcpy.
    // The copy is blocking, so it also reports errors raised while the
    // kernel was executing.
    if (err == cudaSuccess)
        err = cudaMemcpyFromSymbol(f_Tab, d_TAB, tabBytes, 0, cudaMemcpyDeviceToHost);

    free(f_Tab);

    return (err == cudaSuccess) ? 0 : 1;
}

According to the profiler, the occupancy is 0.667, which I believe is due to the accesses to global memory on the device, and I get a very high gl_incoherent count.

Finally, what is the best way to make this code as fast as possible while keeping the table d_TAB in global memory on the device?

THANKS

best Regards

Jonathan

Please delete this post, because I have reposted it in the correct section.
Sorry
Thanks
Regards
Jonathan