Need Help with Global memory on device

Hi,

I have test the following code and the access to global memory on device take a long time.

#define DIMX 5000 

#define DIMY 20 

#define BLOCK_SIZE 256

__device__ float d_TAB[DIMX * DIMY ]

__global__ void fillTab_GPU(int iDimx, int iDimy, float fD1, float fD2)

{

    const int      tid = blockDim.x * blockIdx.x + threadIdx.x;

    const int THREAD_N = blockDim.x * gridDim.x;

    int i = 0, j = 0;

   

    for(i= tid; i< iDimx; i+= THREAD_N)

    {

        for(j = 0; j < iDimy; j++)

        {

            float fMul = fD1 *fD2 + (float) j * fD1;

            int iPos = i * iDimy + j;

            d_TAB[iPos] += 1.0f + fMul;

        }

   }

}

int main(void)

{

    dim3 dimBlock(BLOCK_SIZE);

    dim3 dimGrid( ceil( DIMX/ (float) BLOCK_SIZE));

    float *f_Tab = (float *)calloc(DIMX*DIMY, sizeof(float));

    fillTab_GPU<<< dimGrid, dimBlock>>>(DIMX, DIMY, 2.0f, 1.5f);

   cudaMemcpy(f_Tab , d_TAB, DIMX* DIMY* sizeof(float),             cudaMemcpyDeviceToHost); 

}

With Profiler occupency take 0.667 due to the access to global memory on device and i have a big number on gl_incoherent

Finally, what’s the solution to have the most powerful code with keeping the global memory on device d_TAB

THANKS

best Regards

Jonathan

I think you want to do it like this:

#define DIMX 5000 

#define DIMY 20 

#define BLOCK_SIZE 256

__device__ float d_TAB[DIMX * DIMY ]

__global__ void fillTab_GPU(int iDimx, int iDimy, float fD1, float fD2)

{

    const int      tid = blockDim.x * blockIdx.x + threadIdx.x;

    const int THREAD_N = blockDim.x * gridDim.x;

    int j = 0;

   if (tid < iDimx)

    {

        for(j = 0; j < iDimy; j++)

        {

            float fMul = fD1 * (fD2 + (float) j);

            int iPos = i * iDimy + j;

            d_TAB[iPos] += 1.0f + fMul;

        }

   }

}

int main(void)

{

    dim3 dimBlock(BLOCK_SIZE);

    dim3 dimGrid( ceil( DIMX/ (float) BLOCK_SIZE));

    float *f_Tab = (float *)calloc(DIMX*DIMY, sizeof(float));

    fillTab_GPU<<< dimGrid, dimBlock>>>(DIMX, DIMY, 2.0f, 1.5f);

   cudaMemcpy(f_Tab , d_TAB, DIMX* DIMY* sizeof(float),             cudaMemcpyDeviceToHost); 

}

Thanks, but the problem is not here, the line d_Tab(iPos) causes big gl incoherent
So, i would like to have a clean solution with uses of global memory on the device
Thanks
Best regards
Jonathan

No idea to clean this kernel ?
Thanks
Regards
J

UP !!!

just do it the brute-force way:

 for(i= tid; i< iDimx * iDimy; i+= THREAD_N) {

    d_TAB[tid] += 1 + fD1 * (fD2 + (i % iDimy));

  }

Yes, modulo is very slow, but your code is likely to be bound by memory bandwidth anyway - if not you can use a shift by making iDimy a power of two and using a 2D memcpy to do the padding.