This is a re-post of an earlier topic. I have a test code that gives me the right answer, but very, very slowly–I’m talking kiloflops. Here are some of my novice issues:
:: One problem seems to be in my grid and block dimensioning; I’ve been experimenting, but nothing really seems to help.
:: I’m not sure I’ve coalesced the threads of the matrix yet.
:: On top of this, I have a sneaking suspicion I should be using “if” rather than “for” statements in the CUDA kernel…but so far all tries (none of which are implemented in the version below) have yielded garbage answers (This angle looks especially promising for increasing the speed, but I have no way of confirming this).
Could anyone show me the solutions for one or more of these problems? (feel free to have me clarify anything). Thanks!
[codebox]// # include <stdio.h>
include <stdlib.h>
include <cuda.h>
global void FILENAME(float *VAR_device, float *ANS_device, size_t pitch, size_t pitch, unsigned int stride, int N)
{
extern shared float data;
int x = blockIdx.x*blockDim.x+threadIdx.x;
int y = blockIdx.y*blockDim.y+threadIdx.y;
for (x=0; x<1; x++)
{
for (y=0; y<N; y++)
{
ANS_device[y*stride + x] = VAR_device[y*stride + x];
}
}
for (x=N-1; x<N; x++)
{
for (y=0; y<N; y++)
{
ANS_device[y*stride + x] = VAR_device[y*stride + x];
}
}
for (x=1; x<N-1; x++)
{
for (y=0; y<N; y++)
{
ANS_device[y*stride + x] = 0.25*VAR_device[y*stride + x - 1] + 0.5*VAR_device[y*stride + x] + 0.25*VAR_device[y*stride + x + 1];
}
}
}
int main()
{
float *ANS_device, *VAR_device;
int N = 16;
float VAR_host[N][N], ANS_host[N][N];
int dimA = N*N;
int numThreadsPerBlock = 256;
int numBlocks = dimA/numThreadsPerBlock;
int sharedMemSize = numThreadsPerBlock;
size_t memSize = N*sizeof(float);
size_t pitch;
cudaMallocPitch((void **)(&ANS_device), &pitch, memSize, N);
cudaMallocPitch((void **)(&VAR_device), &pitch, memSize, N);
unsigned int stride;
stride = pitch/sizeof(float);
for (int j=0; j<1; j++)
{
for (int i=0; i<N; i++)
{
VAR_host[i][j] = ((float)i+1)*1000;
}
}
for (int j=1; j<N; j++)
{
for (int i=0; i<N; i++)
{
VAR_host[i][j] = 0;
}
}
cudaMemcpy2D(ANS_device, pitch, VAR_host, memSize, memSize, N, cudaMemcpyHostToDevice);
dim3 dimGrid(N/2, N/2);
dim3 dimBlock(2, 2);
int nIterations = 5000;
for(int k=0; k<nIterations; k++)
{
FILENAME <<< numBlocks, dimBlock, sharedMemSize >>> (ANS_device, VAR_device, pitch, pitch, stride, N);
float *temp = ANS_device;
ANS_device = VAR_device;
VAR_device = temp;
}
cudaMemcpy2D(ANS_host, memSize, VAR_device, pitch, memSize, N, cudaMemcpyDeviceToHost);
cudaFree(VAR_device);
cudaFree(ANS_device);
return 0;
}[/codebox]