I’m having issues with CUDA allocating the wrong number of blocks to a grid. It gives me the right answer, but when I run the code, the profiler ignores my “N/2 x N/2” block size specification and sets up a block size of 4x1. Of course, this means that it runs the code VERY slowly (about 5 MFLOPS on a GTS 120). Any ideas?
Profiler output is attached as a JPEG Thanks in advance for any answers.
[codebox]# include <stdio.h>
include <stdlib.h>
include <cuda.h>
global void FILENAME(float *VAR_device, float *ANS_device, size_t pitch, size_t pitch, unsigned int stride, int N)
{
extern shared float data;
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
for (x=0; x<1; x++)
{
for (y=0; y<N; y++)
{
ANS_device[y*stride + x] = VAR_device[y*stride + x];
}
}
for (x=N-1; x<N; x++)
{
for (y=0; y<N; y++)
{
ANS_device[y*stride + x] = VAR_device[y*stride + x];
}
}
for (x=1; x<N-1; x++)
{
for (y=0; y<N; y++)
{
ANS_device[y*stride + x] = 0.25*VAR_device[y*stride + x - 1] + 0.5*VAR_device[y*stride + x] + 0.25*VAR_device[y*stride + x + 1];
}
}
}
int main()
{
float *ANS_device, *VAR_device;
int N = 32;
int nIterations = 5000;
float VAR_host[N][N], ANS_host[N][N];
int dimA = N*N;
int numThreadsPerBlock = 256;
int numBlocks = dimA/numThreadsPerBlock;
int sharedMemSize = numThreadsPerBlock;
size_t memSize = N*sizeof(float);
size_t pitch;
cudaMallocPitch((void **)(&ANS_device), &pitch, memSize, N);
cudaMallocPitch((void **)(&VAR_device), &pitch, memSize, N);
unsigned int stride;
stride = pitch/sizeof(float);
for (int j=0; j<1; j++)
{
for (int i=0; i<N; i++)
{
VAR_host[i][j] = ((float)i+1)*5;
}
}
for (int j=1; j<N; j++)
{
for (int i=0; i<N; i++)
{
VAR_host[i][j] = 0;
}
}
cudaMemcpy2D(ANS_device, pitch, VAR_host, memSize, memSize, N, cudaMemcpyHostToDevice);
dim3 dimGrid(N/2, N/2);
dim3 dimBlock(2, 2);
for(int k=0; k<nIterations; k++)
{
FILENAME <<< numBlocks, dimBlock, sharedMemSize >>> (ANS_device, VAR_device, pitch, pitch, stride, N);
float *temp = ANS_device;
ANS_device = VAR_device;
VAR_device = temp;
}
cudaMemcpy2D(ANS_host, memSize, VAR_device, pitch, memSize, N, cudaMemcpyDeviceToHost);
cudaFree(VAR_device);
cudaFree(ANS_device);
}[/codebox]