Problem of compiling OpenCV with CUDA

An error occurs when compiling the attached two example codes using the following steps.

gcc -lcv -lhighgui -lcvaux -I/usr/include/opencv -c q_cpu.c

nvcc -c -arch=sm_20 -Xcudafe=-w q_gpu.cu

nvcc L/usr/local/cuda/lib64 -L/usr/local/lib -L/usr/lib -L/usr/lib32 -L/usr/lib64 q_gpu.o q_cpu.o -lcv -lhighgui -lcvaux -o test

The error message is as follows.

q_cpu.o: In function `main’:

q_cpu.c:(.text+0x275): undefined reference to `run_GPU’

collect2: ld returned 1 exit status

make: *** [f] Error 1

I will appreciate if I can get any response.

// q_cpu.c

#include "cv.h"

#include "highgui.h"

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#define VALUE_MAX 10000000.0

int*run_GPU(char *s_img, int s_h, int s_w, char *p_img, int p_h, int p_w, int r_h, int r_w);

struct match{

	int bestRow;

	int bestCol;

	int bestSAD;

}position;

int main( int argc, char** argv )

{

	IplImage* sourceImg; 

	IplImage* patternImg; 

	float minSAD = VALUE_MAX;

	float SAD;

	int x, y, i, j;

	uchar* ptr;

	uchar p_sourceIMG, p_patternIMG;

	CvPoint pt1, pt2;

	int *host_result;

	int result_height, result_width;

	if( argc != 3 )

	{

	    printf("Using command: %s source_image search_image\n",argv[0]);

		exit(1);

	}

	if((sourceImg = cvLoadImage( argv[1], 0)) == NULL){

		printf("%s cannot be openned\n",argv[1]);

		exit(1);

	}

	printf("height of sourceImg:%d\n",sourceImg->height);

	printf("width of sourceImg:%d\n",sourceImg->width);

	printf("size of sourceImg:%d\n",sourceImg->imageSize);

	

	

	if((patternImg = cvLoadImage( argv[2], 0)) == NULL){

		printf("%s cannot be openned\n",argv[2]);

		exit(1);

	}    

	printf("height of sourceImg:%d\n",patternImg->height);

	printf("width of sourceImg:%d\n",patternImg->width);

	printf("size of sourceImg:%d\n",patternImg->imageSize);

	

	//allocate memory on CPU to store SAD results

	result_height = sourceImg->height - patternImg->height + 1;

	result_width = sourceImg->width - patternImg->width + 1;

	

    host_result = run_GPU(sourceImg->imageData, sourceImg->height, sourceImg->width,

        patternImg->imageData, patternImg->height, patternImg->width, result_height, result_width);

	for( y=0; y < result_height; y++ ) {

		for( x=0; x < result_width; x++ ) {

			if ( minSAD > host_result[y * result_width + x] ) {

				minSAD =  host_result[y * result_width + x];

				// give me VALUE_MAX

				position.bestRow = y;

				position.bestCol = x;

				position.bestSAD =  host_result[y * result_width + x];

			}

			

		}

	}

	

	printf("minSAD is %f\n", minSAD);

	//setup the two points for the best match

    pt1.x = position.bestCol;

    pt1.y = position.bestRow;

    pt2.x = pt1.x + patternImg->width;

    pt2.y = pt1.y + patternImg->height;

// Draw the rectangle in the source image

    cvRectangle( sourceImg, pt1, pt2, CV_RGB(255,0,0), 3, 8, 0 );

			

	cvNamedWindow( "sourceImage", 1 );

    cvShowImage( "sourceImage", sourceImg );

	cvNamedWindow( "patternImage", 1 );

    cvShowImage( "patternImage", patternImg );

	

		

    cvWaitKey(0); 

cvDestroyWindow( "sourceImage" );

    cvReleaseImage( &sourceImg );

	cvDestroyWindow( "patternImage" );

    cvReleaseImage( &patternImg );

    return 0;

}
// q_gpu.cu

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include <cuda_runtime.h>

#define iABS(x) (((x)<0)?(-(x)):(x))

#define iAbsDiff(a,b) (((a)<(b))?((b)-(a)):((a)-(b)))

__global__ void kernel (unsigned char* sourcePtr, unsigned char* patternPtr, int* resultPtr, int s_height, int s_width, int p_height, int p_width);

int *run_GPU(char *s_img, int s_h, int s_w, char *p_img, int p_h, int p_w, int r_h, int r_w) {

    unsigned char *dev_sourceImg, *dev_patternImg; 

	int *dev_result;

    int *host_result;

host_result = (int *)malloc(r_h * r_w * sizeof(int));

//allocate the momory on GPU	

	cudaMalloc((void**)&dev_sourceImg, s_h * s_w * sizeof(char));	

	cudaMalloc((void**)&dev_patternImg, p_h * p_w * sizeof(char));	

	cudaMalloc((void**)&dev_result, r_h * r_w * sizeof(int));	

	

	//copy source and pattern image to GPU

	cudaMemcpy(dev_sourceImg, 

	           s_img, 

			   s_h * s_w * sizeof(char),

			   cudaMemcpyHostToDevice);

			   

	cudaMemcpy(dev_patternImg, 

			   p_img, 

			   p_h * p_w * sizeof(char),

			   cudaMemcpyHostToDevice);

	

	dim3 grids(r_w, r_h);

	dim3 threads(16, 16);

	kernel<<<grids, threads>>>(dev_sourceImg, dev_patternImg, dev_result, s_h, s_w, p_h, p_w);

	

	cudaMemcpy(host_result, 

	           dev_result, 

		   r_h * r_w * sizeof(int),

		   cudaMemcpyDeviceToHost);

cudaFree(dev_sourceImg);

    cudaFree(dev_patternImg);

    cudaFree(dev_result);

return host_result;

}

__global__ void kernel (unsigned char* sourcePtr, unsigned char* patternPtr, int* resultPtr, int s_height, int s_width, int p_height, int p_width){

	//map from threadIdx/blockIdx to pixel position

	int s_x = blockIdx.x + threadIdx.x;

	int s_y = blockIdx.y + threadIdx.y;

	int p_x = threadIdx.x; // one block of threads handle 9 blocks

	int p_y = threadIdx.y;

	

	sourcePtr[ s_y*s_width+s_x ] += 1;

}