Hello-
Is anyone using the CUDA runtime API and the opencv API in the same code? When I try to use both I get a compile time error in the gcc header files.
I’m using CUDA 2.0:
nvcc: NVIDIA ® Cuda compiler driver
Copyright © 2005-2007 NVIDIA Corporation
Built on Wed_Jul_16_12:14:50_PDT_2008
Cuda compilation tools, release 2.0, V0.2.1221
/Users/awagner/svn_faces/cuda/cuda_runtime_trivial_example
with an installation of opencv version 1.1.0 that I built. (Newer versions exist, but I haven’t gotten them to build properly)
on OS X 10.5.6
Using apple’s included gcc 4.0.1 :
awagner@hesse(ttys007)->g++ -v
Using built-in specs.
Target: i686-apple-darwin9
Configured with: /var/tmp/gcc/gcc-5465~16/src/configure --disable-checking -enable-werror --prefix=/usr --mandir=/share/man --enable-languages=c,objc,c++,obj-c++ --program-transform-name=/[1][^.-]*$/s/$/-4.0/ --with-gxx-include-dir=/include/c++/4.0.0 --with-slibdir=/usr/lib --build=i686-apple-darwin9 --with-arch=apple --with-tune=generic --host=i686-apple-darwin9 --target=i686-apple-darwin9
Thread model: posix
gcc version 4.0.1 (Apple Inc. build 5465)
/Users/awagner/svn_faces/cuda/cuda_runtime_trivial_example
This is a recent quad core Mac Pro Intel Xeon, with an 8800 GT in an x16 PCI 2.0 slot, with no attached graphics adapter. (The stock ATI Radeon drives my display).
Here is the build error:
[codebox]nvcc trivial_example.cu -lcv -lcxcore -lhighgui -I/usr/local/include/opencv -o trivial_example
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(55): error: identifier “__builtin_ia32_emms” is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(68): error: identifier “__builtin_ia32_vec_init_v2si” is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(111): error: identifier “__builtin_ia32_vec_ext_v2si” is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(150): error: identifier “__builtin_ia32_packsswb” is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/mmintrin.h(165): error: identifier “__builtin_ia32_packssdw” is undefined
…
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/xmmintrin.h(315): error: identifier “__builtin_ia32_movss” is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/xmmintrin.h(325): error: identifier “__builtin_ia32_cmpordss” is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/xmmintrin.h(331): error: identifier “__builtin_ia32_cmpunordss” is undefined
/usr/lib/gcc/i686-apple-darwin9/4.0.1/include/xmmintrin.h(341): error: identifier “__builtin_ia32_cmpeqps” is undefined
Error limit reached.
100 errors detected in the compilation of “/tmp/tmpxft_000149ac_00000000-4_trivial_example.cpp1.ii”.
Compilation terminated.
[/codebox]
Here is my test script. It’s a mess of flags, but it let me quickly nail down where the conflict was. Opencv and Cuda work fine alone, but with the combo I get the above error.
[codebox]// To compile as cuda code…
// nvcc trivial_example.cu -lcv -lcxcore -lhighgui -I/usr/local/include/opencv -o trivial_example
// To compile as pure C++, make sure USE_CUDA flag is turned off,
// ln -s trivial_example.cu trivial_example.cpp
// g++ trivial_example.cpp -lcv -lcxcore -lhighgui -I/usr/local/include/opencv -o trivial_example
#define USE_CUDA 1
#define USE_OPENCV 1
#define USE_A_CLASS 0
#define CALL_KERNEL_FROM_MEMBER_FUNCTION 0
#include "stdio.h"
#include <iostream>
#if USE_CUDA
#include "cuda.h"
#endif
#if USE_OPENCV
#include "cv.h"
#endif
#if USE_CUDA
// Trivial test kernel: increments a[0] by 1, routing the value through
// shared memory purely to exercise the __shared__ qualifier.
// Expected launch configuration: a single thread (<<<1,1>>>); `a` must be
// a device pointer to at least one int.
__global__ void trivial_kernel(int* a) {
    __shared__ int a_shared;
    a_shared = a[0];
    a_shared++;
    a[0] = a_shared;
}
#endif // USE_CUDA (opened before the kernel definition)
#if USE_A_CLASS
// Test class: exercises creating an OpenCV matrix and (optionally)
// launching a CUDA kernel from inside a member function, to isolate
// where the OpenCV/CUDA header conflict appears.
class boo
{
public:
    boo() {}
    ~boo() {}
public:
    // Allocates a 1x1 OpenCV matrix and, when enabled, runs trivial_kernel
    // on a single device int. Returns 5 as a fixed sentinel value.
    int local_func() {
#if USE_OPENCV
        CvMat* A = cvCreateMat( 1, 1, CV_32FC1 );
        cvReleaseMat(&A);   // release the matrix — the original leaked it
#endif
#if (USE_CUDA && CALL_KERNEL_FROM_MEMBER_FUNCTION)
        // Try calling a kernel
        int data = 23;
        cudaError err1;
        int * deviceData;
        err1 = cudaMalloc((void**) &deviceData, sizeof(int)); // runtime API device malloc
        // Check against cudaSuccess: cudaMalloc can fail for reasons other
        // than cudaErrorMemoryAllocation, and any failure leaves deviceData unusable.
        if(err1 != cudaSuccess) {
            printf("ERROR! Ran out of Memory on the graphics card!\n");
            exit(0);
        }
        cudaMemcpy(deviceData, &data, sizeof(int), cudaMemcpyHostToDevice);
        dim3 threads(1, 1);
        dim3 grid(1,1);
        trivial_kernel<<< grid, threads >>>(deviceData);
        // copy result from device to host (cudaMemcpy blocks until the kernel finishes)
        cudaMemcpy(&data, deviceData, sizeof(int), cudaMemcpyDeviceToHost);
        cudaFree(deviceData);   // free the device buffer — the original leaked it
        std::cout << "Original value = " << "23" << ", Incremented value = " << data << "\n";
#endif
        // return moved outside the #if so the function always returns a value
        return 5;
    }
    int local_var;
};
#endif // USE_A_CLASS
// Entry point of the C++ program.
// Entry point: optionally initializes a CUDA device, optionally exercises
// the boo test class, then runs the trivial kernel directly from main.
int main(int argc, char** argv)
{
    printf("Starting Main...\n");
#if USE_CUDA
    // Initialization
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if(deviceCount>1) {
        // Multiple devices: warn but continue with device 0.
        // (The original exited here, which contradicted the warning text.)
        std::cout << "WARNING: We haven't coded for multiple devices yet; we may not have grabbed the best device!\n";
    }
    if(deviceCount==0) {
        std::cout << "ERROR: No CUDA enabled devices were detected!\n";
        exit(-1);   // no device: cannot continue (the original fell through here)
    }
    cudaSetDevice(0); // Takes the index of the device you want to attach to. Eat the overhead now instead of later when the first cuda runtime function gets called.
#endif
#if USE_A_CLASS
    boo my_boo;
    my_boo.local_func();
#endif
#if (USE_CUDA && !CALL_KERNEL_FROM_MEMBER_FUNCTION)
    // Computation
    cudaError err1;
    int* deviceData;
    // Ready our data on the host.
    int data = 23;
    err1 = cudaMalloc((void**) &deviceData, sizeof(int)); // runtime API device malloc
    // Check against cudaSuccess: cudaMalloc can fail for reasons other than
    // cudaErrorMemoryAllocation (e.g. no context, invalid device).
    if(err1 != cudaSuccess) {
        printf("ERROR! Ran out of Memory on the graphics card!\n");
        exit(0);
    }
    cudaMemcpy(deviceData, &data, sizeof(int), cudaMemcpyHostToDevice);
    // Call the kernel
    dim3 threads(1, 1);
    dim3 grid(1,1);
    trivial_kernel<<< grid, threads >>>(deviceData);
    // Launch errors (bad configuration, etc.) only surface via cudaGetLastError().
    err1 = cudaGetLastError();
    if(err1 != cudaSuccess) {
        printf("ERROR! Kernel launch failed.\n");
        exit(-1);
    }
    // copy result from device to host (cudaMemcpy blocks until the kernel finishes)
    cudaMemcpy(&data, deviceData, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(deviceData);
    std::cout << "Original value = " << "23" << ", Incremented value = " << data << "\n";
#endif
    printf("Exiting Main...\n");
    return 0;
}
[/codebox]
Thanks!
Drew
-
cg ↩︎