Taking 2-D iterating code to 3-D

Hey all,

I have a 2-D iterating code that runs at 600-800 MFLOPS on a GTS 120 (bad, but a heckuva lot faster than the C version), but I’ve hit a problem. When I convert this 2-D code to 3-D, everything goes haywire; it spits out an answer, but the profiler reveals that the kernel won’t even run. What’s puzzling me is that the 2-D code I have below seems to lend itself very nicely to conversion to 3-D, since none of the copy/call/run/etc. commands seem to be for one particular dimension (i.e. I’m not using a cudaMemcpy2D or anything like that)–of course, CUDA is never that easy. The 2-D code is posted first, followed by the 3-D version. I’m working on the problem as you read this, but any suggestions would be greatly appreciated. As always, thanks in advance for any replies.

P.S. And yes, I’ve tested the 2-D code and all indications are that it works perfectly.

2-D version:

[codebox]// CUDAtest2Dexample3.cu

include <stdio.h>

include <stdlib.h>

include <cuda.h>

global void FILENAME(float *VAR_device, float *ANS_device, int N)

{

int x = blockIdx.x*blockDim.x + threadIdx.x;

int y = blockIdx.y*blockDim.y + threadIdx.y;

if (x==0 || x==N-1)

{

ANS_device[y*N + x] = VAR_device[y*N + x];

}

else

{

ANS_device[y*N + x] = 0.25*VAR_device[y*N + x - 1] + 0.5*VAR_device[y*N + x] + 0.25*VAR_device[y*N + x + 1];

}

__syncthreads();

}

int main()

{

float *ANS_device, *VAR_device;

int N = 1024;

int nIterations = 100;

int a = 32;

int b = 16;

float VAR_host[N][N], ANS_host[N][N];

cudaMalloc((void **)(&ANS_device), NNsizeof(float));

cudaMalloc((void **)(&VAR_device), NNsizeof(float));

for (int j=0; j<1; j++)

{

for (int i=0; i<N; i++)

{

  VAR_host[i][j] = (float)i + 2;

}

}

for (int j=1; j<N; j++)

{

for (int i=0; i<N; i++)

{

  VAR_host[i][j] = 1;

}

}

printf("\n");

for (int i=N-1; i<N; i++)

{

for (int j=0; j<N; j++)

{

  printf("%f   ", VAR_host[i][j]);

}

printf("\n");

}

printf("\n");

cudaMemcpy(ANS_device, VAR_host, NNsizeof(float), cudaMemcpyHostToDevice);

dim3 dimGrid(N/a, N/b);

dim3 dimBlock(a, B);

for(int k=0; k<nIterations; k++)

{

FILENAME <<< dimGrid, dimBlock >>> (ANS_device, VAR_device, N);

float *temp = ANS_device;

ANS_device = VAR_device;

VAR_device = temp;

}

cudaMemcpy(ANS_host, VAR_device, NNsizeof(float), cudaMemcpyDeviceToHost);

printf("\n");

for (int i=N-2; i<N; i++)

{

for (int j=0; j<N; j++)

{

  printf("%f   ", ANS_host[i][j]);

}

printf("\n\n");

}

printf("\n");

cudaFree(VAR_device);

cudaFree(ANS_device);

return 0;

}[/codebox]

3-D version:

[codebox]// CUDAtest2Dexample3.cu

include <stdio.h>

include <stdlib.h>

include <cuda.h>

global void FILENAME(float *VAR_device, float *ANS_device, int N)

{

int x = blockIdx.x*blockDim.x + threadIdx.x;

int y = blockIdx.y*blockDim.y + threadIdx.y;

int z = blockIdx.z*blockDim.z + threadIdx.z;

ANS_device[zNN + yN + x] = VAR_device[zNN + yN + x];

__syncthreads();

}

int main()

{

float *ANS_device, *VAR_device;

int N = 16;

int nIterations = 100;

int a = 32;

int b = 16;

float VAR_host[N][N][N], ANS_host[N][N][N];

cudaMalloc((void **)(&ANS_device), NNN*sizeof(float));

cudaMalloc((void **)(&VAR_device), NNN*sizeof(float));

for (int j=0; j<1; j++)

{

for (int i=0; i<N; i++)

{

  for (int k=0; k<N; k++)

  {

    VAR_host[i][j][k] = 1;

  }

}

}

for (int j=1; j<N; j++)

{

for (int i=0; i<N; i++)

{

  for (int k=0; k<N; k++)

  {

    VAR_host[i][j][k] = 0;

  }

}

}

printf("\n");

for (int i=N-1; i<N; i++)

{

for (int j=0; j<N; j++)

{

  for (int k=0; k<N; k++)

  {

    printf("%f   ", VAR_host[i][j][k]);

  }

  printf("\n");

}

printf("\n");

}

printf("\n");

cudaMemcpy(ANS_device, VAR_host, NNN*sizeof(float), cudaMemcpyHostToDevice);

dim3 dimGrid(N/a, N/b, 1);

dim3 dimBlock(a, b, 1);

for(int g=0; g<nIterations; g++)

{

FILENAME <<< dimGrid, dimBlock >>> (ANS_device, VAR_device, N);

float *temp = ANS_device;

ANS_device = VAR_device;

VAR_device = temp;

}

cudaMemcpy(ANS_host, VAR_device, NNN*sizeof(float), cudaMemcpyDeviceToHost);

printf("\n");

for (int i=N-1; i<N; i++)

{

for (int j=0; j<N; j++)

{

  for (int k=0; k<N; k++)

  {

    printf("%f   ", ANS_host[i][j][k]);

  }

  printf("\n\n");

}

printf("\n\n");

}

printf("\n");

cudaFree(VAR_device);

cudaFree(ANS_device);

return 0;

}[/codebox]

The 3D code has 512 threads per block. It’s possible you’ve run out of registers on the multiprocessor to execute a block that big. Can you compile your code with the “–ptxas-options=-v” option passed to nvcc? That will tell you the number of registers per threads your kernel needs.