I have a 2-D iterating code that works, but very, very slowly (I timed it and it seems to be operating at several hundred kiloflops, maybe a megaflop). Does anyone see something wrong with it, or is the fault perhaps with the computer running it?
include <stdio.h>
include <stdlib.h>
include <cuda.h>
global void FILENAME(float VAR_device, float ANS_device, size_t pitch_A, size_t pitch_B, unsigned int stride, int N)
{
int x = blockIdx.xblockDim.x+threadIdx.x;
int y = blockIdx.yblockDim.y+threadIdx.y;
for (x=0; x<1; x++)
{
for (y=0; y<N; y++)
{
ANS_device[ystride + x] = VAR_device[ystride + x];
}
}
for (x=N-1; x<N; x++)
{
for (y=0; y<N; y++)
{
ANS_device[ystride + x] = VAR_device[ystride + x];
}
}
for (x=1; x<N-1; x++)
{
for (y=0; y<N; y++)
{
ANS_device[ystride + x] = 0.25VAR_device[ystride + x - 1] + 0.5VAR_device[ystride + x] + 0.25VAR_device[y*stride + x + 1];
}
}
}
int main()
{
int N = 16;
float *ANS_device, VAR_device;
size_t size = Nsizeof(float);
float VAR_host[N][N], ANS_host[N][N];
size_t pitch_A, pitch_B;
cudaMallocPitch((void **)(&ANS_device), &pitch_A, size, N);
cudaMallocPitch((void **)(&VAR_device), &pitch_B, size, N);
unsigned int stride;
stride = pitch_A/sizeof(float);
for (int j=0; j<1; j++)
{
for (int i=0; i<N; i++)
{
VAR_host[i][j] = 300;
}
}
for (int j=1; j<N; j++)
{
for (int i=0; i<N; i++)
{
VAR_host[i][j] = 0;
}
}
cudaMemcpy2D(ANS_device, pitch_A, VAR_host, size, size, N, cudaMemcpyHostToDevice);
dim3 dimBlock(N/2, N/2);
dim3 nBlocks(2, 2);
int nIterations = 5000;
for(int k=0; k<nIterations; k++)
{
FILENAME <<< nBlocks, dimBlock >>> (ANS_device, VAR_device, pitch_A, pitch_B, stride, N);
float *temp = ANS_device;
ANS_device = VAR_device;
VAR_device = temp;
}
cudaMemcpy2D(ANS_host, size, VAR_device, pitch_B, size, N, cudaMemcpyDeviceToHost);
for (int i=0; i<N; i++)
{
for (int j=0; j<N; j++)
{
printf(“%f “, ANS_host[i][j]);
}
printf(”\n”);
}
cudaFree(VAR_device);
cudaFree(ANS_device);
return 0;
}
Thanks in advance for any replies.