Hi,
I have test the following code and the access to global memory on device take a long time.
#define DIMX 5000
#define DIMY 20
#define BLOCK_SIZE 256
__device__ float d_TAB[DIMX * DIMY ]
__global__ void fillTab_GPU(int iDimx, int iDimy, float fD1, float fD2)
{
   const int      tid = blockDim.x * blockIdx.x + threadIdx.x;
   const int THREAD_N = blockDim.x * gridDim.x;
   int i = 0, j = 0;
 Â
   for(i= tid; i< iDimx; i+= THREAD_N)
   {
       for(j = 0; j < iDimy; j++)
       {
           float fMul = fD1 *fD2 + (float) j * fD1;
           int iPos = i * iDimy + j;
           d_TAB[iPos] += 1.0f + fMul;
       }
   }
}
int main(void)
{
   dim3 dimBlock(BLOCK_SIZE);
   dim3 dimGrid( ceil( DIMX/ (float) BLOCK_SIZE));
   float *f_Tab = (float *)calloc(DIMX*DIMY, sizeof(float));
   fillTab_GPU<<< dimGrid, dimBlock>>>(DIMX, DIMY, 2.0f, 1.5f);
   cudaMemcpy(f_Tab , d_TAB, DIMX* DIMY* sizeof(float),             cudaMemcpyDeviceToHost);
}
With Profiler occupency take 0.667 due to the access to global memory on device and i have a big number on gl_incoherent
Finally, what’s the solution to have the most powerful code with keeping the global memory on device d_TAB
THANKS
best Regards
Jonathan