Hey there,
I’m hoping to use CUDA for an image processing project: a very simple kernel that calculates the intensity of each pixel by summing its color channels (RGB). Each thread handles one pixel of a 1600x1200 image. I’ve tried passing the pixel data in as a uchar4 array, and I’ve also tried creating a 2D array in global device memory using cudaMallocPitch() and cudaMemcpy2D() and passing in a pointer to that image array; a rough sketch of both is below.
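For reference, the pixel kernel is essentially this (a simplified sketch, not my exact code; intensity, the parameter names, and h_img are just placeholders):

__global__ void intensity(const uchar4 *pixels, unsigned int *out, int width, int height)
{
    int px = blockIdx.x * blockDim.x + threadIdx.x;   // pixel column
    int py = blockIdx.y * blockDim.y + threadIdx.y;   // pixel row
    if (px < width && py < height) {
        uchar4 p = pixels[py * width + px];
        out[py * width + px] = p.x + p.y + p.z;       // R + G + B
    }
}

and the pitched variant allocates and copies roughly like this:

uchar4 *d_img;
size_t pitch;   // row stride in bytes, chosen by the driver
cudaMallocPitch((void**)&d_img, &pitch, width * sizeof(uchar4), height);
cudaMemcpy2D(d_img, pitch, h_img, width * sizeof(uchar4),
             width * sizeof(uchar4), height, cudaMemcpyHostToDevice);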
However, even when I comment out all of the kernel’s instructions, the GPU version is always several times slower than a reference implementation running on the CPU. I’ve included some dummy code below; the data type differs from my actual program, but I’m allocating memory the same way.
I’m using Visual Studio 2008. I’ve run the following code on a laptop with a Quadro FX 570M: the CPU took 0.05 s, and when the GPU portion ran, the screen blinked black and it took ~6 s. I copied the Visual Studio project to a desktop with a Tesla C1060: the CPU took 0.04 s while the GPU jumped to ~20 s.
I can’t understand these performance differences, and I’m not sure I’m allocating blocks and threads in the correct dimensions. Changing the array size to less than 512 causes the Quadro laptop to take 17 s on the GPU, while the Tesla executes in 0.5 s and the CPU in 0.00 s. I’m also not sure whether some Visual Studio issues are occurring: switching between debug and release mode affects only the GPU speed (running the bandwidth test sample in debug mode drops the device-to-device bandwidth to 0.5 GB/s), and I’m not sure the VS compiler settings are being set/managed correctly.
I’m really lost on what can be done to improve performance. Any help would be incredibly appreciated.
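For what it’s worth, the SDK samples I’ve looked at launch a 1D configuration for a 1D array, something like the sketch below, so if the 2D grid in the code further down is wrong, that may be part of my problem:

int blockSize = 256;                  // threads per block, the usual choice in the samples
int gridSize  = iDivUp(n, blockSize); // enough blocks to cover all n elements
GPU<<<gridSize, blockSize>>>(n, 2, x_devMem, y_devMem);

Anyway, here’s the dummy code: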
#include <stdio.h>
#include <stdlib.h> // malloc, free, system
#include <time.h> // clock
#include <cuda_runtime.h>
void CPU(int n, int alpha, int *x, int *y);
int iDivUp(int a, int b) // Round a / b up to the nearest integer, e.g. iDivUp(5000, 16) == 313
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
int iAlignUp(int a, int b) // Round a up to the nearest multiple of b, e.g. iAlignUp(5000, 16) == 5008 (unused below)
{ return (a % b != 0) ? (a - a % b + b) : a; }
/** Kernel Definition **/
__global__ void GPU(int n, int alpha, int *x, int *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<n) y[i] = alpha*x[i] + y[i];
}
void CPU(int n, int alpha, int *x, int *y)
{ for (int i=0; i<n; i++) y[i] = alpha*x[i] + y[i]; }
/** Host Definition **/
int main(void)
{
cudaSetDevice(0);
int timingLoops=10000;
int n=5000;
int arraySize = n*sizeof(int);
int *x = (int *) malloc(arraySize);
for (int i=0; i<n; i++) x[i] = 1;
int *x_devMem;
cudaMalloc( (void**)&x_devMem, arraySize);
cudaMemcpy(x_devMem, x, arraySize, cudaMemcpyHostToDevice);
int *y = (int *) malloc(arraySize);
for (int i=0; i<n; i++) y[i] = 0; // y is read before it is written, so it must be initialized
int *y_devMem;
cudaMalloc( (void**)&y_devMem, arraySize);
cudaMemcpy(y_devMem, y, arraySize, cudaMemcpyHostToDevice); // the kernel also reads y, so the device copy needs starting values too
/** CPU Calculation **/
clock_t startTimeCPU, endTimeCPU;
startTimeCPU = clock();
for (int i=0; i<timingLoops; i++)
CPU(n,2,x,y);
endTimeCPU = clock();
double secondsCPU = ((double)endTimeCPU - (double)startTimeCPU) / (double)CLOCKS_PER_SEC;
printf ("Seconds to completion (CPU): %f\n", secondsCPU);
system("pause");
/** GPU Calculation **/
clock_t startTimeGPU, endTimeGPU;
startTimeGPU = clock();
dim3 dimBlock(8,16);
dim3 dimGrid( iDivUp(n,dimBlock.x), iDivUp(n,dimBlock.y) );
// NOTE: with n=5000 this is a 625x313 grid of 8x16 blocks (~25M threads),
// but the kernel only uses the x index, so many threads compute the same i
for (int i=0; i<timingLoops; i++)
GPU<<<dimGrid, dimBlock>>>(n,2,x_devMem,y_devMem);
cudaThreadSynchronize(); // launches are asynchronous; wait for the kernels to finish before stopping the clock
endTimeGPU = clock();
double secondsGPU = ((double)endTimeGPU - (double)startTimeGPU) / (double)CLOCKS_PER_SEC;
printf ("\nSeconds to completion (GPU): %f\n", secondsGPU);
system("pause");
cudaMemcpy(y, y_devMem, arraySize, cudaMemcpyDeviceToHost);
cudaFree(x_devMem);
cudaFree(y_devMem);
free(x);
free(y);
return 0;
}
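If it matters, I could also switch to CUDA events for the GPU timing instead of clock(); as I understand it, that would look roughly like this (untested sketch):

cudaEvent_t start, stop;
float msGPU = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int i = 0; i < timingLoops; i++)
    GPU<<<dimGrid, dimBlock>>>(n, 2, x_devMem, y_devMem);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);                // wait until all recorded work has finished
cudaEventElapsedTime(&msGPU, start, stop); // milliseconds between the two events
printf("Seconds to completion (GPU, events): %f\n", msGPU / 1000.0f);
cudaEventDestroy(start);
cudaEventDestroy(stop);

Would that be more reliable than clock() here?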