I did some more work on the coalescing; I now have a 12x12x12 running at 600+ MFLOPS. That is, as they say, progress. So thanks a bunch.
Of course, scaling a concept up is never as simple as it seems :wacko:. The code below is my attempt to run a 12x12x16 for 100,000 iterations. The only change is that its configuration is now “NxNxQ,” not NxNxN," but running it produces garbled answers. Is it even possible to run a coalesced matrix with non-equal dimensions (i.e. not “NxNxN”), or am I just missing something?
[codebox]global void FILENAME(float *VAR_device, float *ANS_device, int N, int D, int nIterations)
{
const int P = 12; //
const int Q = 16;
shared float block[QPP];
int xBlock = blockIdx.x*blockDim.x;
int yBlock = blockIdx.y*blockDim.y;
int zBlock = blockIdx.z*blockDim.z;
int x = xBlock + threadIdx.x;
int y = yBlock + threadIdx.y;
int z = zBlock + threadIdx.z;
for(int k=0; k<nIterations; k++)
{
block[z*D*N + y*N + x] = VAR_device[z*D*N + y*N + x];
ANS_device[zDN + yN + x] = block[zDN + yN + x] + 1;
float *temp = ANS_device;
ANS_device = VAR_device;
VAR_device = temp;
}
}
int main()
{
float *ANS_device, *VAR_device;
int N = 12; //
int D = 16; //
int dimA = DNN;
int nIterations = 100000; //
int a = 16; //
int b = 12; //
float VAR_host[D][N][N], ANS_host[D][N][N];
cudaMalloc((void **)(&ANS_device), dimA*sizeof(float));
cudaMalloc((void **)(&VAR_device), dimA*sizeof(float));
for (int i=0; i<N; i++)
{
for (int j=0; j<N; j++)
{
for (int k=0; k<N; k++)
{
VAR_host[i][j][k] = float(j)*float(i) + 1; //
}
}
}
cudaMemcpy(ANS_device, VAR_host, dimA*sizeof(float), cudaMemcpyHostToDevice);
dim3 dimGrid(N*N/b, D/a);
dim3 dimBlock(a, B); // smiley face = “b”
FILENAME <<< dimGrid, dimBlock, NND*sizeof(float) >>> (ANS_device, VAR_device, N, D, nIterations);
cudaMemcpy(ANS_host, ANS_device, NND*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(VAR_device);
cudaFree(ANS_device);
return 0;
}[/codebox]