Hi Everybody,
I am new to CUDA and just started experimenting with it. Using
OpenCV, I tried to implement a benchmark for measuring the time
needed to fill a 5 MP image with a constant value. I am compiling
with VS 2010 for x64, using a GTX 560 TI.
My “Release” timing results are:
[i]
CPU Set: 3.145791 (ms)
GPU Set: 31.395227 (ms)
[/i]
Since the GPU version is so slow, I must have forgotten some important points.
I would appreciate it if someone could point them out to me.
Regards!
Code (file:main.cpp):
extern "C" void cuSetValue(byte*, int, int, size_t, byte);

// Benchmarks filling an image with a constant value: once on the CPU via
// OpenCV's Mat assignment operator, once on the GPU via cuSetValue.
// Returns 0 on success, 1 when the input image cannot be loaded.
int main(){
StopWatch myTimer;
unsigned int timer;
const int FILL_VALUE = 128;
try {
cv::Mat image = cv::imread("a1_Referenz.png",0);
// imread does NOT throw on a missing/unreadable file — it returns an
// empty Mat, so the catch block below would never see that failure.
if (image.empty()) {
printf("Failed to load a1_Referenz.png\n");
return 1;
}
// Serial version using openCV
myTimer.startTimer();
image = FILL_VALUE;
myTimer.stopTimer();
printf( "CPU Set: %f (ms)\n", myTimer.getElapsedTime());
// CUDA version. The first call pays the one-time CUDA context-creation
// cost, so it is excluded from the measurement...
// NOTE(review): cuSetValue's parameters are (width, height) but the call
// passes (rows, cols); harmless here because only their product is used,
// but the names should be swapped — confirm against cuSetValue.cu.
cuSetValue(image.data, image.rows, image.cols, image.total()*image.elemSize(), FILL_VALUE);
// ...hence time the second call. NOTE(review): this measurement still
// includes cudaMalloc, the device->host copy, and cudaFree inside
// cuSetValue — not just the kernel — so it is not a kernel-only timing.
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));
cuSetValue(image.data, image.rows, image.cols, image.total()*image.elemSize(), FILL_VALUE);
CUT_SAFE_CALL(cutStopTimer(timer));
printf( "GPU Set: %f (ms)\n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));  // was missing the terminating ';'
} catch(const cv::Exception& e) {
printf("%s", e.what());
}
// Keep the console window open until Enter is pressed.
while (1) if ('\n' == getchar()) break;
return 0;
}
(file: cuSetValue.cu)
// Writes `value` to exactly one byte of `ptr` per launched thread.
// Flat index: (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x.
// With the original <<<grid, 1>>> launch (blockDim.x == 1, threadIdx.x == 0)
// this reduces to the previous blockIdx-only expression, so existing callers
// are unaffected — but it now also supports multi-thread blocks, which are
// required for reasonable throughput (1-thread blocks leave 31 of the 32
// warp lanes idle and were the main reason the GPU timing looked so slow).
// NOTE(review): there is no bounds check — the caller must size the launch
// so that every computed offset lies inside the allocation.
__global__ void myKernel( unsigned char *ptr, unsigned char value ) {
// map from threadIdx/blockIdx to a linear byte position
size_t block  = (size_t)blockIdx.y * gridDim.x + blockIdx.x;
size_t offset = block * blockDim.x + threadIdx.x;
ptr[offset] = value;
}
// Fills a device buffer with `value` via myKernel and copies the result back
// into the host buffer `image` (byteCount bytes).
// NOTE(review): the grid covers width*height elements, which equals byteCount
// only for single-channel images (elemSize == 1); for multi-channel images the
// tail bytes would be copied back uninitialized — verify against the caller.
extern "C" void cuSetValue(unsigned char* image, int width, int height, size_t byteCount, unsigned char value) {
unsigned char* pImgD = 0;  // was referenced as pImg_d below: undeclared identifier
// Allocate byteCount (not width*height): the device->host copy below reads
// byteCount bytes, and the two sizes differ whenever elemSize > 1.
cutilSafeCall( cudaMalloc( (void**)&pImgD, byteCount ) );
// One block per element matches the kernel's indexing, but 1-thread blocks
// waste 31 of 32 warp lanes; prefer ~256-thread blocks with a grid sized
// by ceil-division once the kernel bounds-checks its offset.
dim3 grid(height,width);
myKernel<<< grid,1 >>>( pImgD, value );
// Kernel launches do not return errors directly; fetch any launch error here.
cutilSafeCall( cudaGetLastError() );
cutilSafeCall( cudaMemcpy( image, pImgD, byteCount, cudaMemcpyDeviceToHost ) );
cutilSafeCall( cudaFree( pImgD ) );
}
Command Line nvcc