I’m trying to benchmark CUDA performance in performing a Sobel operator using texture memory.
but i’m getting a dissappointing performance result.
performing Sobel operator on
600 x 847 pixel JPG image took ~50ms
1000 x 1280 pixel PNG image took ~120ms
1820 x 2570 pixel JPG image took ~440ms
i dont know what causing the performance drop. And wether if i’m using a correct block size
here’s my main.cpp code
#include <stdlib.h>
#include <iostream>
#include <string.h>
#include <Windows.h>
#include <opencv2\core\core.hpp>
#include <opencv2\highgui\highgui.hpp>
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>
#include "CudaKernel.h"
using namespace cv;
using namespace std;
IplImage* gpusobel(IplImage *image){
IplImage* image2=cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
IplImage* floatimage=cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
//Convert the input image to float
cvConvert(image,floatimage);
float *output=(float*)image2->imageData;
float *input=(float*)floatimage->imageData;
kernelcall(input, output, floatimage->width,floatimage->height, floatimage->widthStep);
cvScale(image2,image2,1.0/255.0);
return image2;
}
void devquery(cudaDeviceProp devProp)
{
printf("Name: %s\n", devProp.name);
printf("Total global memory: %u\n", devProp.totalGlobalMem);
printf("Total shared memory per block: %u\n", devProp.sharedMemPerBlock);
printf("Total registers per block: %d\n", devProp.regsPerBlock);
printf("Warp size: %d\n", devProp.warpSize);
printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock);
printf("Clock rate: %d\n", devProp.clockRate);
printf("Total constant memory: %u\n", devProp.totalConstMem);
printf("Texture alignment: %u\n", devProp.textureAlignment);
printf("Concurrent copy and execution: %s\n", (devProp.deviceOverlap ? "Yes" : "No"));
printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
return;
}
int main(int argc, char** argv)
{
IplImage* image;
cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, 0);
devquery(devProp);
IplImage* result;
try
{
image = cvLoadImage("images/1359301895642.jpg", CV_LOAD_IMAGE_GRAYSCALE);
}
catch (const cv::Exception* ex)
{
cout << "Error: " << ex->what() << endl;
}
if(!image )
{
cout << "Could not open or find the image" << std::endl ;
return -1;
}
/* old code, switched to function
image2=cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
image3=cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
//Convert the input image to float
cvConvert(image,image3);
float *output=(float*)image2->imageData;
float *input=(float*)image3->imageData;
kernelcall(input, output, image3->width,image3->height, image3->widthStep);
cvScale(image2,image2,1.0/255.0);
*/
result = gpusobel(image);
cvShowImage( "Original Image", image ); // Show our image inside it.
cvShowImage("Sobeled Image", result);
waitKey(0); // Wait for a keystroke in the window
return 0;
}
and my .cu file
#include<cuda.h>
#include<iostream>
#include "CudaKernel.h"
using namespace std;
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
texture <float,2,cudaReadModeElementType> tex1;
static cudaArray *cuArray = NULL;
//Kernel for x direction sobel
__global__ void implement_x_sobel(float* output,int width,int height,int widthStep)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
//Make sure that thread is inside image bounds
if(x<width && y<height)
{
float output_value = (-1*tex2D(tex1,x-1,y-1)) + (0*tex2D(tex1,x,y-1)) + (1*tex2D(tex1,x+1,y-1))
+ (-2*tex2D(tex1,x-1,y)) + (0*tex2D(tex1,x,y)) + (2*tex2D(tex1,x+1,y))
+ (-1*tex2D(tex1,x-1,y+1)) + (0*tex2D(tex1,x,y+1)) + (1*tex2D(tex1,x+1,y+1))
+ (-1*tex2D(tex1,x-1,y-1)) + (-2*tex2D(tex1,x,y-1)) + (-1*tex2D(tex1,x+1,y-1))
+ (0*tex2D(tex1,x-1,y)) + (0*tex2D(tex1,x,y)) + (0*tex2D(tex1,x+1,y))
+ (1*tex2D(tex1,x-1,y+1)) + (2*tex2D(tex1,x,y+1)) + (1*tex2D(tex1,x+1,y+1));
output[y*widthStep+x]=output_value;
}
}
//Kernel for y direction sobel
__global__ void implement_y_sobel(float* output,int width,int height,int widthStep)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
//Make sure that thread is inside image bounds
if(x<width && y<height)
{
float output_value = (-1*tex2D(tex1,x-1,y-1)) + (-2*tex2D(tex1,x,y-1)) + (-1*tex2D(tex1,x+1,y-1))
+ (0*tex2D(tex1,x-1,y)) + (0*tex2D(tex1,x,y)) + (0*tex2D(tex1,x+1,y))
+ (1*tex2D(tex1,x-1,y+1)) + (2*tex2D(tex1,x,y+1)) + (1*tex2D(tex1,x+1,y+1)); ;
output[y*widthStep+x]=output_value;
}
}
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
if( cudaSuccess != err) {
fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
file, line, (int)err, cudaGetErrorString( err ) );
exit(-1);
}
}
//Host Code
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
if ( cudaSuccess != err )
{
printf("cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if ( cudaSuccess != err )
{
printf("cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif
return;
}
void kernelcall(float* input,float* output,int width,int height,int widthStep){
cudaEvent_t start,stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//cudaChannelFormatDesc channelDesc=cudaCreateChannelDesc(32,32,0,0,cudaChannelFormatKindFloat);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
CudaSafeCall(cudaMallocArray(&cuArray,&channelDesc,width,height));
//cudaMemcpyToArray(cuArray,0,0,input,widthStep*height,cudaMemcpyHostToDevice);
cudaMemcpy2DToArray(cuArray,0,0,input,widthStep,width * sizeof(float),height,cudaMemcpyHostToDevice);
tex1.addressMode[0]=cudaAddressModeClamp;
tex1.addressMode[1]=cudaAddressModeClamp;
tex1.filterMode=cudaFilterModeLinear;
cudaBindTextureToArray(tex1,cuArray,channelDesc);
tex1.normalized=false;
float * D_output_x;
float * D_output_y;
CudaSafeCall(cudaMalloc(&D_output_x,widthStep*height));
CudaSafeCall(cudaMalloc(&D_output_y,widthStep*height));
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(width+blocksize.x-1)/blocksize.x;
gridsize.y=(height+blocksize.y-1)/blocksize.y;
cudaEventRecord(start, 0);
printf("----- Executing CUDA kernel ----\n");
implement_x_sobel<<<gridsize,blocksize>>>(D_output_x,width,height,widthStep/sizeof(float));
cudaEventRecord(stop, 0);
cudaThreadSynchronize();
CudaCheckError();
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf ("Kernel Execution Complete!!\n");
printf ("Image size : %d x %d pixel\n", height,width);
printf ("Time for the kernel: %f ms\n", time);
cudaUnbindTexture(tex1);
CudaSafeCall(cudaMemcpy(output,D_output_x,height*widthStep,cudaMemcpyDeviceToHost));
cudaFree(D_output_x);
cudaFreeArray(cuArray);
}
forgot to mention, my GPU is 9800M GS