Slow performance when performing a Sobel operator using texture memory

I’m trying to benchmark CUDA performance for a Sobel operator implemented with texture memory, but I’m getting disappointing results.

Performing the Sobel operator on a

600 x 847 pixel JPG image took ~50 ms
1000 x 1280 pixel PNG image took ~120 ms
1820 x 2570 pixel JPG image took ~440 ms

I don’t know what’s causing the slowdown, or whether I’m using a correct block size.

Here’s my main.cpp code:

#include <stdlib.h>
#include <iostream>
#include <Windows.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <cuda_runtime.h>
#include "CudaKernel.h"

using namespace cv;
using namespace std;

IplImage* gpusobel(IplImage *image){

	IplImage* image2=cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);
	IplImage* floatimage=cvCreateImage(cvGetSize(image),IPL_DEPTH_32F,image->nChannels);

	//Convert the input image to float
	cvConvert(image,floatimage);

	float *output=(float*)image2->imageData;
	float *input=(float*)floatimage->imageData;

	kernelcall(input, output, floatimage->width, floatimage->height, floatimage->widthStep);

	//Scale the result back into a displayable range
	cvScale(image2,image2,1.0/255.0);

	//The float copy of the input is no longer needed
	cvReleaseImage(&floatimage);
	return image2;
}

void devquery(cudaDeviceProp devProp)
{
    printf("Name:                          %s\n",  devProp.name);
    //size_t fields are cast so the format specifier is correct on 64-bit builds
    printf("Total global memory:           %lu\n", (unsigned long)devProp.totalGlobalMem);
    printf("Total shared memory per block: %lu\n", (unsigned long)devProp.sharedMemPerBlock);
    printf("Total registers per block:     %d\n",  devProp.regsPerBlock);
    printf("Warp size:                     %d\n",  devProp.warpSize);
    printf("Maximum threads per block:     %d\n",  devProp.maxThreadsPerBlock);
    printf("Clock rate:                    %d\n",  devProp.clockRate);
    printf("Total constant memory:         %lu\n", (unsigned long)devProp.totalConstMem);
    printf("Texture alignment:             %lu\n", (unsigned long)devProp.textureAlignment);
    printf("Concurrent copy and execution: %s\n",  (devProp.deviceOverlap ? "Yes" : "No"));
    printf("Number of multiprocessors:     %d\n",  devProp.multiProcessorCount);
}

int main(int argc, char** argv) 
{
	IplImage* image;
	cudaDeviceProp devProp;
	cudaGetDeviceProperties(&devProp, 0);
	devquery(devProp);
	IplImage* result;
	try
	{
		image = cvLoadImage("images/1359301895642.jpg", CV_LOAD_IMAGE_GRAYSCALE);
	}
	catch (const cv::Exception& ex) //OpenCV throws cv::Exception by value, so catch by reference
	{
		cout << "Error: " << ex.what() << endl;
	}
	if(!image)
	{
		cout << "Could not open or find the image" << std::endl;
		return -1;
	}

	


	result = gpusobel(image);
	cvShowImage("Original Image", image); //show our image inside the window
	cvShowImage("Sobeled Image", result);

	cvWaitKey(0); //wait for a keystroke in the window

	cvReleaseImage(&image);
	cvReleaseImage(&result);
	return 0;
}

And my .cu file:

#include<cuda.h>
#include<iostream>
#include "CudaKernel.h"
using namespace std;
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)

texture <float,2,cudaReadModeElementType> tex1;
static cudaArray *cuArray = NULL;

//Kernel computing both the x- and y-direction Sobel responses.
//Note that the two gradients are summed directly (Gx + Gy), which is
//not the same as the gradient magnitude sqrt(Gx^2 + Gy^2).
__global__ void implement_x_sobel(float* output,int width,int height,int widthStep)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    //Make sure that the thread is inside image bounds
    if(x<width && y<height)
    {
        //Horizontal gradient Gx
        float gx = -tex2D(tex1,x-1,y-1)                       + tex2D(tex1,x+1,y-1)
                 - 2*tex2D(tex1,x-1,y)                      + 2*tex2D(tex1,x+1,y)
                 -  tex2D(tex1,x-1,y+1)                       + tex2D(tex1,x+1,y+1);

        //Vertical gradient Gy
        float gy = -tex2D(tex1,x-1,y-1) - 2*tex2D(tex1,x,y-1) - tex2D(tex1,x+1,y-1)
                 +  tex2D(tex1,x-1,y+1) + 2*tex2D(tex1,x,y+1) + tex2D(tex1,x+1,y+1);

        output[y*widthStep+x] = gx + gy;
    }
}

//Kernel for the y-direction Sobel (not called below; left over from an earlier version)
__global__ void implement_y_sobel(float* output,int width,int height,int widthStep)
{
	int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    //Make sure that thread is inside image bounds
    if(x<width && y<height)
    {
        float output_value = -tex2D(tex1,x-1,y-1) - 2*tex2D(tex1,x,y-1) - tex2D(tex1,x+1,y-1)
                           +  tex2D(tex1,x-1,y+1) + 2*tex2D(tex1,x,y+1) + tex2D(tex1,x+1,y+1);

        output[y*widthStep+x]=output_value;
    }
}

inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if( cudaSuccess != err) {
	    fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
                file, line, (int)err, cudaGetErrorString( err ) );
        exit(-1);
    }
}

//Host code
//NOTE: these helpers are no-ops unless CUDA_ERROR_CHECK is defined
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    if ( cudaSuccess != err )
    {
        printf("cudaSafeCall() failed at %s:%i : %s\n",
               file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }
#endif
}

inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    cudaError err = cudaGetLastError();
    if ( cudaSuccess != err )
    {
        printf("cudaCheckError() failed at %s:%i : %s\n",
               file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }
#endif
}

void kernelcall(float* input,float* output,int width,int height,int widthStep){
	cudaEvent_t start,stop;
	float time;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

	CudaSafeCall(cudaMallocArray(&cuArray,&channelDesc,width,height));
	//widthStep is the source pitch in bytes; only width*sizeof(float) bytes per row are copied
	CudaSafeCall(cudaMemcpy2DToArray(cuArray,0,0,input,widthStep,width*sizeof(float),height,cudaMemcpyHostToDevice));

	//Set all texture parameters before binding
	tex1.addressMode[0]=cudaAddressModeClamp;
	tex1.addressMode[1]=cudaAddressModeClamp;
	//Point filtering: with unnormalized integer coordinates, linear filtering
	//would blend neighbouring texels instead of returning exact pixel values
	tex1.filterMode=cudaFilterModePoint;
	tex1.normalized=false;

	cudaBindTextureToArray(tex1,cuArray,channelDesc);
	float * D_output_x;
	CudaSafeCall(cudaMalloc(&D_output_x,widthStep*height));

	//16x16 = 256 threads per block; grid dimensions use ceiling division
	//so the whole image is covered
	dim3 blocksize(16,16);
	dim3 gridsize;
	gridsize.x=(width+blocksize.x-1)/blocksize.x;
	gridsize.y=(height+blocksize.y-1)/blocksize.y;

	printf("----- Executing CUDA kernel ----\n");
	cudaEventRecord(start, 0);
	implement_x_sobel<<<gridsize,blocksize>>>(D_output_x,width,height,widthStep/sizeof(float));
	cudaEventRecord(stop, 0);

	cudaThreadSynchronize();
	CudaCheckError();
	cudaEventSynchronize(stop);

	cudaEventElapsedTime(&time, start, stop);
	
	printf ("Kernel Execution Complete!!\n");
	printf ("Image size : %d x %d pixel\n", height,width);
	printf ("Time for the kernel: %f ms\n", time);

	cudaUnbindTexture(tex1);
	CudaSafeCall(cudaMemcpy(output,D_output_x,height*widthStep,cudaMemcpyDeviceToHost));
	cudaFree(D_output_x);
	cudaFreeArray(cuArray);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
}

Forgot to mention: my GPU is a GeForce 9800M GS.

Where do you see a performance drop?

It took 440 ms just to process a 1820 x 2570 image. Is this normal?

And what is your system? What are its parameters? Memory bandwidth, for example.

My PC has a
Core 2 Duo P7450 @ 2.13 GHz
4 GB DDR2 RAM

My CUDA device is a GeForce 9800M GS:
512 MB dedicated memory
64 CUDA cores
core clock 530 MHz
memory clock 799 MHz (1598 MHz data rate)

According to the bandwidth test that comes with the CUDA SDK samples, my memory bandwidth is:

host to device : 1218.8 MB/s
device to host : 954 MB/s
device to device : 30231 MB/s

I am not sure what you mean by “performance drop”. Your largest image contains 9.2 times the number of pixels of your smallest image, and this ratio corresponds closely to the ratio of the run times for the two images, which is 440 / 50 = 8.8.
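For what it’s worth, the 9.2 figure comes straight from the pixel counts. A trivial standalone check, using only the numbers quoted in this thread:

#include <cstdio>

int main() {
    double small_pixels = 600.0 * 847.0;    //508200 pixels, took ~50 ms
    double large_pixels = 1820.0 * 2570.0;  //4677400 pixels, took ~440 ms
    printf("pixel ratio: %.1f\n", large_pixels / small_pixels); //~9.2
    printf("time ratio:  %.1f\n", 440.0 / 50.0);                //8.8
    return 0;
}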

I’m not sure about this, but isn’t this rather slow for a CUDA computation?

I do not have experience in the area of image processing and therefore have no idea what performance one should expect. I note that the GPU used is fairly low end. I would suggest using the profiler to zero in on potential problem areas and to identify potential bottlenecks.
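For example, you could bracket the upload and the kernel with separate CUDA events to see where the time goes. A minimal sketch that reuses the variables from kernelcall above (the event names t0/t1/t2 are just illustrative):

cudaEvent_t t0, t1, t2;
cudaEventCreate(&t0); cudaEventCreate(&t1); cudaEventCreate(&t2);

cudaEventRecord(t0, 0);
//host-to-device upload into the CUDA array backing the texture
cudaMemcpy2DToArray(cuArray, 0, 0, input, widthStep,
                    width * sizeof(float), height, cudaMemcpyHostToDevice);
cudaEventRecord(t1, 0);
implement_x_sobel<<<gridsize, blocksize>>>(D_output_x, width, height,
                                           widthStep / sizeof(float));
cudaEventRecord(t2, 0);
cudaEventSynchronize(t2);

float copy_ms, kernel_ms;
cudaEventElapsedTime(&copy_ms, t0, t1);
cudaEventElapsedTime(&kernel_ms, t1, t2);
printf("upload: %f ms, kernel: %f ms\n", copy_ms, kernel_ms);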

Image size is 1820 x 2570 ≈ 5 Mpixels, which is about 20 MB as floats. Each pixel needs only a few reads from memory plus one write, so a purely bandwidth-bound kernel could process on the order of 2000 such images per second: 20 MB × 2000 = 40 GB per second, which is about your theoretical memory bandwidth.
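To make that estimate concrete, here is a back-of-the-envelope lower bound on the frame time for a bandwidth-bound pass, using the device-to-device figure from the bandwidth test quoted above (one float read plus one float write per pixel is optimistic, since the nine texture taps are mostly served by the texture cache):

#include <cstdio>

int main() {
    double pixels      = 1820.0 * 2570.0;    //~4.7 Mpixels
    double bytes_moved = pixels * 4.0 * 2.0; //one float read + one float write per pixel
    double bandwidth   = 30231e6;            //device-to-device, bytes per second
    printf("bandwidth-bound lower bound: %.2f ms per frame\n",
           bytes_moved / bandwidth * 1000.0); //~1.24 ms, vs the ~440 ms measured
    return 0;
}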

Yeah, I realize that my GPU is a mobile version of the GeForce, so does my problem lie in the hardware rather than my code?

How about my code, is there any problem with it? What I’m afraid of is that I wrote inefficient code and/or used an improper kernel block size.

You may play with the block size and try different configurations. Is 440 ms 0.0004 s? By the way, in your listing implement_x_sobel contains the code for both stencils. Also, try using bytes instead of floats; it may speed things up (see the sketch below).
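A rough, untested sketch of what the byte version could look like; bind_byte_texture is just an illustrative helper name. With cudaReadModeNormalizedFloat the hardware converts each byte to a float in [0,1] on the fly, so the kernel arithmetic stays in float:

//Sketch: an 8-bit texture instead of the 32-bit float one. The 8-bit
//grayscale image is uploaded directly, so the transfer is a quarter of
//the size and the host-side cvConvert to float goes away.
texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex8;
static cudaArray *cuArray8 = NULL;

void bind_byte_texture(const unsigned char* data, int width, int height, int strideBytes)
{
    cudaChannelFormatDesc desc8 = cudaCreateChannelDesc<unsigned char>();
    cudaMallocArray(&cuArray8, &desc8, width, height);
    cudaMemcpy2DToArray(cuArray8, 0, 0, data, strideBytes,
                        width * sizeof(unsigned char), height,
                        cudaMemcpyHostToDevice);
    tex8.addressMode[0] = cudaAddressModeClamp;
    tex8.addressMode[1] = cudaAddressModeClamp;
    tex8.filterMode     = cudaFilterModePoint;
    tex8.normalized     = false;
    cudaBindTextureToArray(tex8, cuArray8, desc8);
    //tex2D(tex8, x, y) in the kernel now returns a float in [0,1],
    //so the final cvScale by 1/255 on the host is no longer needed
}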

Unless there is a typo, “ms” = “millisecond” = “1/1000 of a second”, so one frame at 1820 x 2570 apparently takes 0.440 seconds, for a throughput of slightly better than two frames per second.

I was trying to separate the x-direction and y-direction Sobel operators, but I changed my mind and combined them in a single kernel. implement_y_sobel isn’t called anywhere in the code; I forgot to delete it.
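For reference, the more common combined form computes the two gradients separately and merges them with the cheap |Gx| + |Gy| magnitude approximation, so that responses of opposite sign do not cancel. A sketch of that variant (not my exact code):

__global__ void sobel_combined(float* output, int width, int height, int widthStep)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height)
    {
        float gx = -tex2D(tex1,x-1,y-1)                       + tex2D(tex1,x+1,y-1)
                 - 2*tex2D(tex1,x-1,y)                      + 2*tex2D(tex1,x+1,y)
                 -  tex2D(tex1,x-1,y+1)                       + tex2D(tex1,x+1,y+1);
        float gy = -tex2D(tex1,x-1,y-1) - 2*tex2D(tex1,x,y-1) - tex2D(tex1,x+1,y-1)
                 +  tex2D(tex1,x-1,y+1) + 2*tex2D(tex1,x,y+1) + tex2D(tex1,x+1,y+1);
        //|Gx| + |Gy| approximates sqrt(Gx*Gx + Gy*Gy) without the sqrt
        output[y*widthStep + x] = fabsf(gx) + fabsf(gy);
    }
}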

How do I use bytes? Can I just change the declaration?

Yes, ms as in milliseconds; the kernel needs 0.44 seconds to complete.