Hey all,
I have an iterative 2-D code that runs at 600-800 MFLOPS on a GTS 120 (bad, but a heckuva lot faster than the C version), but I’ve hit a problem. When I convert this 2-D code to 3-D, everything goes haywire: it spits out an answer, but the profiler reveals that the kernel won’t even run. What’s puzzling me is that the 2-D code below seems to lend itself very nicely to conversion to 3-D, since none of the copy/call/run/etc. commands are tied to one particular dimension (i.e. I’m not using cudaMemcpy2D or anything like that). Of course, CUDA is never that easy. The 2-D code is posted first, followed by the 3-D version. I’m working on the problem as you read this, but any suggestions would be greatly appreciated. As always, thanks in advance for any replies.
P.S. And yes, I’ve tested the 2-D code and all indications are that it works perfectly.
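For reference, the only indexing change inside the kernel should be the usual flat-index extension (just restating what the two kernels below do):

[codebox]// 2-D: element (x, y) of an N x N array stored flat
int idx2 = y*N + x;

// 3-D: element (x, y, z) of an N x N x N array stored flat
int idx3 = z*N*N + y*N + x;[/codebox]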
2-D version:
[codebox]// CUDAtest2Dexample3.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// three-point stencil along x, applied row by row to an N x N grid
__global__ void FILENAME(float *VAR_device, float *ANS_device, int N)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x==0 || x==N-1)
{
// boundary columns pass through unchanged
ANS_device[y*N + x] = VAR_device[y*N + x];
}
else
{
// weighted average of the cell and its two x-neighbours
ANS_device[y*N + x] = 0.25*VAR_device[y*N + x - 1] + 0.5*VAR_device[y*N + x] + 0.25*VAR_device[y*N + x + 1];
}
__syncthreads();
}
int main()
{
float *ANS_device, *VAR_device;
int N = 1024;
int nIterations = 100;
int a = 32;
int b = 16;
float VAR_host[N][N], ANS_host[N][N]; // ~4 MB each, allocated on the stack
cudaMalloc((void **)(&ANS_device), N*N*sizeof(float));
cudaMalloc((void **)(&VAR_device), N*N*sizeof(float));
// column 0 carries the boundary values; everything else starts at 1
for (int i=0; i<N; i++)
{
VAR_host[i][0] = (float)i + 2;
}
for (int j=1; j<N; j++)
{
for (int i=0; i<N; i++)
{
VAR_host[i][j] = 1;
}
}
printf("\n");
// print the last row as a sanity check
for (int j=0; j<N; j++)
{
printf("%f ", VAR_host[N-1][j]);
}
printf("\n");
printf("\n");
// initial data goes to ANS_device, which is passed as the kernel's input on the first pass
cudaMemcpy(ANS_device, VAR_host, N*N*sizeof(float), cudaMemcpyHostToDevice);
dim3 dimGrid(N/a, N/b);
dim3 dimBlock(a, b); // 32 x 16 = 512 threads per block
for(int k=0; k<nIterations; k++)
{
FILENAME <<< dimGrid, dimBlock >>> (ANS_device, VAR_device, N);
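// ping-pong: the buffer just written becomes the input for the next pass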
float *temp = ANS_device;
ANS_device = VAR_device;
VAR_device = temp;
}
// after the final swap the newest result is in ANS_device
cudaMemcpy(ANS_host, ANS_device, N*N*sizeof(float), cudaMemcpyDeviceToHost);
printf("\n");
for (int i=N-2; i<N; i++)
{
for (int j=0; j<N; j++)
{
printf("%f ", ANS_host[i][j]);
}
printf("\n\n");
}
printf("\n");
cudaFree(VAR_device);
cudaFree(ANS_device);
return 0;
}[/codebox]
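Side note: to tell whether a launch actually happens, a check right after the kernel call (just a sketch) should do it:

[codebox]FILENAME <<< dimGrid, dimBlock >>> (ANS_device, VAR_device, N);
// cudaGetLastError() reports configuration/launch failures that
// otherwise go by silently
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("kernel launch failed: %s\n", cudaGetErrorString(err));
}[/codebox]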
3-D version:
[codebox]// CUDAtest2Dexample3.cu, converted to 3-D
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// for now the kernel just copies VAR into ANS
__global__ void FILENAME(float *VAR_device, float *ANS_device, int N)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
int z = blockIdx.z*blockDim.z + threadIdx.z;
ANS_device[z*N*N + y*N + x] = VAR_device[z*N*N + y*N + x];
__syncthreads();
}
int main()
{
float *ANS_device, *VAR_device;
int N = 16;
int nIterations = 100;
int a = 32;
int b = 16;
float VAR_host[N][N][N], ANS_host[N][N][N];
cudaMalloc((void **)(&ANS_device), N*N*N*sizeof(float));
cudaMalloc((void **)(&VAR_device), N*N*N*sizeof(float));
// the j = 0 slice carries the boundary values; everything else starts at 0
for (int i=0; i<N; i++)
{
for (int k=0; k<N; k++)
{
VAR_host[i][0][k] = 1;
}
}
for (int j=1; j<N; j++)
{
for (int i=0; i<N; i++)
{
for (int k=0; k<N; k++)
{
VAR_host[i][j][k] = 0;
}
}
}
printf("\n");
// print the last i-slice as a sanity check
for (int j=0; j<N; j++)
{
for (int k=0; k<N; k++)
{
printf("%f ", VAR_host[N-1][j][k]);
}
printf("\n");
}
printf("\n");
printf("\n");
// initial data goes to ANS_device, which is passed as the kernel's input on the first pass
cudaMemcpy(ANS_device, VAR_host, N*N*N*sizeof(float), cudaMemcpyHostToDevice);
dim3 dimGrid(N/a, N/b, 1);
dim3 dimBlock(a, b, 1);
for(int g=0; g<nIterations; g++)
{
FILENAME <<< dimGrid, dimBlock >>> (ANS_device, VAR_device, N);
float *temp = ANS_device;
ANS_device = VAR_device;
VAR_device = temp;
}
// after the final swap the newest result is in ANS_device
cudaMemcpy(ANS_host, ANS_device, N*N*N*sizeof(float), cudaMemcpyDeviceToHost);
printf("\n");
// print the last i-slice of the result
for (int j=0; j<N; j++)
{
for (int k=0; k<N; k++)
{
printf("%f ", ANS_host[N-1][j][k]);
}
printf("\n\n");
}
printf("\n\n");
printf("\n");
cudaFree(VAR_device);
cudaFree(ANS_device);
return 0;
}[/codebox]
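One suspect spot I’ve noticed while typing this up: with N = 16, a = 32, and b = 16, the integer division in dimGrid(N/a, N/b, 1) makes the grid’s x-dimension 0, which I’d expect to kill the launch before the kernel ever starts. On top of that, I gather pre-Fermi cards like the GTS 120 only support 2-D grids, so blockIdx.z would always be 0 anyway. Here’s a sketch of what I’m going to try next (FILENAME3D is just a placeholder): round the grid up so it always covers the domain, guard the padded threads, and walk z inside the kernel.

[codebox]__global__ void FILENAME3D(float *VAR_device, float *ANS_device, int N)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= N || y >= N) return; // guard threads outside the domain
for (int z=0; z<N; z++) // 2-D grid, so z is walked in the kernel
{
ANS_device[z*N*N + y*N + x] = VAR_device[z*N*N + y*N + x];
}
}

// launch with ceil-division so every grid dimension is at least 1
dim3 dimBlock(a, b, 1);
dim3 dimGrid((N + a - 1)/a, (N + b - 1)/b, 1);
FILENAME3D <<< dimGrid, dimBlock >>> (ANS_device, VAR_device, N);[/codebox]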