Hi,
I have test the following code and the access to global memory on device take a long time.
#define DIMX 5000
#define DIMY 20
#define BLOCK_SIZE 256
__device__ float d_TAB[DIMX * DIMY ]
__global__ void fillTab_GPU(int iDimx, int iDimy, float fD1, float fD2)
{
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
const int THREAD_N = blockDim.x * gridDim.x;
int i = 0, j = 0;
for(i= tid; i< iDimx; i+= THREAD_N)
{
for(j = 0; j < iDimy; j++)
{
float fMul = fD1 *fD2 + (float) j * fD1;
int iPos = i * iDimy + j;
d_TAB[iPos] += 1.0f + fMul;
}
}
}
int main(void)
{
dim3 dimBlock(BLOCK_SIZE);
dim3 dimGrid( ceil( DIMX/ (float) BLOCK_SIZE));
float *f_Tab = (float *)calloc(DIMX*DIMY, sizeof(float));
fillTab_GPU<<< dimGrid, dimBlock>>>(DIMX, DIMY, 2.0f, 1.5f);
cudaMemcpy(f_Tab , d_TAB, DIMX* DIMY* sizeof(float), cudaMemcpyDeviceToHost);
}
With Profiler occupency take 0.667 due to the access to global memory on device and i have a big number on gl_incoherent
Finally, what’s the solution to have the most powerful code with keeping the global memory on device d_TAB
THANKS
best Regards
Jonathan