Optimizing 2-D CUDA code

This is a re-post of an earlier topic. I’ve begun the optimization process of a 2-D matrix iterating program by calling shared memory in the CUDA kernel (not very efficiently, however), but I’m still only getting a few megaflops per second using a GTS 120.

:: If I’m not mistaken, one of the problems is in my grid and block dimensioning; I’ve been searching, but nothing really seems to help.

:: Another issue seems to be in the iterative structure towards the end of the main() code–it seems as though the data is being passed device --> main --> device with every iteration.

:: Additionally, I’m not sure I’ve even coalesced the threads of the matrix yet.

:: Finally, increasing the number of variables I have in my matrix brings an exponential increase in time–for example, a 16x16 takes 0.8 sec to iterate 5000 times, while a 128x128 takes many minutes (I had to kill it around the four-minute mark; who knows how long it would’ve taken to run?) rather than roughly a minute, as it should’ve.

There are doubtlessly other issues; can anyone show me the solutions for one or more of these problems? Feel free to have me clarify anything; I’ll be on this board all day. Thanks.

// CUDAtest2Dexample3.cu

include <stdio.h>

include <stdlib.h>

include <cuda.h>

global void FILENAME(float VAR_device, float ANS_device, size_t pitch_A, size_t pitch_B, unsigned int stride, int N)
{
extern shared float data[];
int x = blockIdx.x
blockDim.x+threadIdx.x;
int y = blockIdx.y
blockDim.y+threadIdx.y;

for (x=0; x<1; x++)
{
for (y=0; y<N; y++)
{
ANS_device[ystride + x] = VAR_device[ystride + x];
}
}
for (x=N-1; x<N; x++)
{
for (y=0; y<N; y++)
{
ANS_device[ystride + x] = VAR_device[ystride + x];
}
}
for (x=1; x<N-1; x++)
{
for (y=0; y<N; y++)
{
ANS_device[ystride + x] = 0.25VAR_device[ystride + x - 1] + 0.5VAR_device[ystride + x] + 0.25VAR_device[y*stride + x + 1];
}
}
}

int main()
{
float *ANS_device, VAR_device;
int N = 16;
float VAR_host[N][N], ANS_host[N][N];
int dimA = N
N;

int numThreadsPerBlock = 256;
int numBlocks = dimA/numThreadsPerBlock;
int sharedMemSize = numThreadsPerBlock;

size_t memSize = N*sizeof(float);

size_t pitch_A, pitch_B;
cudaMallocPitch((void **)(&ANS_device), &pitch_A, memSize, N);
cudaMallocPitch((void **)(&VAR_device), &pitch_B, memSize, N);

unsigned int stride;
stride = pitch_A/sizeof(float);

for (int j=0; j<1; j++)
{
for (int i=0; i<N; i++)
{
VAR_host[i][j] = ((float)i+1)*1000;
}
}

for (int j=1; j<N; j++)
{
for (int i=0; i<N; i++)
{
VAR_host[i][j] = 0;
}
}

cudaMemcpy2D(ANS_device, pitch_A, VAR_host, memSize, memSize, N, cudaMemcpyHostToDevice);

dim3 dimGrid(N/2, N/2);
dim3 dimBlock(2, 2);

int nIterations = 5000;
for(int k=0; k<nIterations; k++)
{
FILENAME <<< numBlocks, dimBlock, sharedMemSize >>> (ANS_device, VAR_device, pitch_A, pitch_B, stride, N);
float *temp = ANS_device;
ANS_device = VAR_device;
VAR_device = temp;
}

cudaMemcpy2D(ANS_host, memSize, VAR_device, pitch_B, memSize, N, cudaMemcpyDeviceToHost);

for (int i=0; i<N; i++)
{
for (int j=0; j<N; j++)
{
printf("%f “, ANS_host[i][j]);
}
printf(”\n");
}

cudaFree(VAR_device);
cudaFree(ANS_device);

return 0;
}