Build conflict with opencv on os x?

Hello-

Is anyone using the CUDA runtime API and the opencv API in the same code? When I try to use both I get a compile time error in the gcc header files.

I’m using CUDA 2.0:

nvcc: NVIDIA ® Cuda compiler driver

Copyright © 2005-2007 NVIDIA Corporation

Built on Wed_Jul_16_12:14:50_PDT_2008

Cuda compilation tools, release 2.0, V0.2.1221

/Users/awagner/svn_faces/cuda/cuda_runtime_trivial_example

with an installation of opencv version 1.1.0 that I built. (Newer versions exist, but I haven’t gotten them to build properly)

on OS X 10.5.6

Using apple’s included gcc 4.0.1 :

awagner@hesse(ttys007)->g++ -v

Using built-in specs.

Target: i686-apple-darwin9

Configured with: /var/tmp/gcc/gcc-5465~16/src/configure --disable-checking -enable-werror --prefix=/usr --mandir=/share/man --enable-languages=c,objc,c++,obj-c++ --program-transform-name=/^[cg][^.-]*/s//-4.0/ --with-gxx-include-dir=/include/c++/4.0.0 --with-slibdir=/usr/lib --build=i686-apple-darwin9 --with-arch=apple --with-tune=generic --host=i686-apple-darwin9 --target=i686-apple-darwin9

Thread model: posix

gcc version 4.0.1 (Apple Inc. build 5465)

/Users/awagner/svn_faces/cuda/cuda_runtime_trivial_example

This is a recent quad core Mac Pro Intel Xeon, with an 8800 GT in an x16 PCI 2.0 slot, with no attached graphics adapter. (The stock ATI Radeon drives my display).

Here is the build error:

[codebox]nvcc trivial_example.cu -lcv -lcxcore -lhighgui -I/usr/local/include/opencv -o trivial_example

/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(55): error: identifier “__builtin_ia32_emms” is undefined

/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(68): error: identifier “__builtin_ia32_vec_init_v2si” is undefined

/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(111): error: identifier “__builtin_ia32_vec_ext_v2si” is undefined

/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(150): error: identifier “__builtin_ia32_packsswb” is undefined

/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(165): error: identifier “__builtin_ia32_packssdw” is undefined

/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/xmmintrin.h(315): error: identifier “__builtin_ia32_movss” is undefined

/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/xmmintrin.h(325): error: identifier “__builtin_ia32_cmpordss” is undefined

/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/xmmintrin.h(331): error: identifier “__builtin_ia32_cmpunordss” is undefined

/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/xmmintrin.h(341): error: identifier “__builtin_ia32_cmpeqps” is undefined

Error limit reached.

100 errors detected in the compilation of “/tmp/tmpxft_000149ac_00000000-4_trivial_example.cpp1.ii”.

Compilation terminated.

[/codebox]

Here is my test script. It’s a mess of flags, but it let me quickly nail down where the conflict was. OpenCV and CUDA work fine alone, but with the combo I get the above error.

[codebox]// To compile as CUDA code...
// nvcc trivial_example.cu -lcv -lcxcore -lhighgui -I/usr/local/include/opencv -o trivial_example
//
// To compile as pure C++, make sure the USE_CUDA flag is turned off, then:
// ln -s trivial_example.cu trivial_example.cpp
// g++ trivial_example.cpp -lcv -lcxcore -lhighgui -I/usr/local/include/opencv -o trivial_example

// Feature toggles used to isolate which combination of APIs triggers the build conflict.
#define USE_CUDA 1
#define USE_OPENCV 1
#define USE_A_CLASS 0
#define CALL_KERNEL_FROM_MEMBER_FUNCTION 0

#include <stdio.h>    // printf (original post had smart quotes around the header name)
#include <stdlib.h>   // exit
#include <iostream>   // std::cout -- the forum formatter ate this header name in the original post

#if USE_CUDA
#include "cuda.h"
#endif

#if USE_OPENCV
#include "cv.h"
#endif

#if USE_CUDA

// Trivial kernel: increments a[0] by one, staging the value through shared
// memory purely to exercise the __shared__ qualifier (a plain (*a)++ would
// behave identically). Intended to be launched with a 1x1 grid and a 1x1
// block, so there is no race on a_shared.
// NOTE(review): the forum formatting stripped the leading/trailing
// underscores from __global__ in the original post; restored here.
__global__ void trivial_kernel(int* a) {
    // (*a)++;  // equivalent one-liner, kept from the original for reference
    __shared__ int a_shared;
    a_shared = a[0];
    a_shared++;
    a[0] = a_shared;
}

#endif

#if USE_A_CLASS

// Minimal class used to test whether OpenCV calls and CUDA kernel launches
// can coexist inside a C++ member function when compiled by nvcc.
class boo
{
public:
    boo() {}
    ~boo() {}

public:
    // Exercises OpenCV and/or the CUDA runtime from inside a member
    // function, depending on the compile-time flags. Always returns 5.
    int local_func() {
#if USE_OPENCV
        CvMat* A = cvCreateMat( 1, 1, CV_32FC1 );
        cvReleaseMat( &A );  // release immediately -- the matrix leaked in the original; we only need the API call
#endif

#if (USE_CUDA && CALL_KERNEL_FROM_MEMBER_FUNCTION)
        // Try calling a kernel.
        int data = 23;
        cudaError err1;
        int* deviceData;
        err1 = cudaMalloc((void**) &deviceData, sizeof(int));  // runtime API device malloc
        // Check every failure mode, not just cudaErrorMemoryAllocation
        // (the original missed e.g. cudaErrorInitializationError).
        if (err1 != cudaSuccess) {
            printf("ERROR! cudaMalloc failed: %s\n", cudaGetErrorString(err1));
            exit(0);
        }
        cudaMemcpy(deviceData, &data, sizeof(int), cudaMemcpyHostToDevice);

        dim3 threads(1, 1);
        dim3 grid(1, 1);
        trivial_kernel<<< grid, threads >>>(deviceData);

        // copy result from device to host (cudaMemcpy synchronizes with the kernel)
        cudaMemcpy(&data, deviceData, sizeof(int), cudaMemcpyDeviceToHost);
        cudaFree(deviceData);  // was leaked in the original member-function path

        std::cout << "Original value = " << "23" << ", Incremented value = " << data << "\n";
#endif

        return 5;
    }

    int local_var;
};

#endif

// Entry point of the C++ program. Initializes the CUDA device, optionally
// exercises the test class, then runs a trivial round-trip: copy an int to
// the device, increment it in a kernel, copy it back, and print it.
int main(int argc, char** argv)
{
    printf("Starting Main...\n");

#if USE_CUDA
    // Initialization
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    if(deviceCount==0) {
        // Original code printed this error but fell through and called
        // cudaSetDevice anyway; a missing device is fatal, so exit here.
        std::cout << "ERROR: No CUDA enabled devices were detected!\n";
        exit(-1);
    }

    if(deviceCount>1) {
        // Multiple devices is survivable -- we just may not have grabbed the
        // best one. Warn and continue. (The original exited with -1 here,
        // contradicting its own "WARNING" wording, while the zero-device
        // ERROR case did not exit.)
        std::cout << "WARNING: We haven't coded for multiple devices yet; we may not have grabbed the best device!\n";
    }

    cudaSetDevice(0); // Takes the index of the device you want to attach to.  Eat the overhead now instead of later when the first cuda runtime function gets called.
#endif

#if USE_A_CLASS
    boo my_boo;
    my_boo.local_func();
#endif

#if (USE_CUDA && !CALL_KERNEL_FROM_MEMBER_FUNCTION)
    // Computation
    cudaError err1;
    int* deviceData;

    // Ready our data on the host.
    int data = 23;

    err1 = cudaMalloc((void**) &deviceData, sizeof(int)); // runtime API device malloc
    // Check every failure mode, not just cudaErrorMemoryAllocation.
    if(err1 != cudaSuccess) {
        printf("ERROR! cudaMalloc failed: %s\n", cudaGetErrorString(err1));
        exit(0);
    }
    cudaMemcpy(deviceData, &data, sizeof(int), cudaMemcpyHostToDevice);

    // Call the kernel (1x1 grid, 1x1 block -- see trivial_kernel's contract).
    dim3 threads(1, 1);
    dim3 grid(1, 1);
    trivial_kernel<<< grid, threads >>>(deviceData);

    // Kernel launches don't return errors directly; check for launch-config
    // failures explicitly.
    err1 = cudaGetLastError();
    if(err1 != cudaSuccess) {
        printf("ERROR! Kernel launch failed: %s\n", cudaGetErrorString(err1));
        exit(0);
    }

    // copy result from device to host (this blocking copy also synchronizes
    // with the kernel, so 'data' is valid afterwards)
    cudaMemcpy(&data, deviceData, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(deviceData);

    std::cout << "Original value = " << "23" << ", Incremented value = " << data << "\n";
#endif

    printf("Exiting Main...\n");
}

[/codebox]

Thanks!

Drew

Hello-

I found a workaround for this. I compile all of my code that makes calls to the CUDA runtime API separately from my code that makes calls to OpenCV. To make this work, you have to wrap every runtime API function and kernel you call in a C wrapper function (that has CUDA runtime stuff in the body but no OpenCV stuff in the interface). If your goal is to have functions that take CvMats, you’ll then write another level of wrapper functions, compiled separately, that have CvMats in their interface and calls to your other wrapper functions in their body. It’s ugly and frustrating and it makes the code harder to maintain, but it works. It sounds like the subset of C++ that nvidia’s compiler supports is small enough (compared to the complexity of g++) that you’ll probably have to go through this whenever you need to use a moderately sized library with CUDA.

Cheers,
Drew