CUDA execution performance problems with CMake compilation

Hello,

I have written a program with the Nsight Eclipse Edition, and if I compile and run it in RELEASE mode, it is very fast, as expected.

Now I want to write my own CMakeLists.txt file to compile all the files and detect all the settings needed for compilation automatically. The CMakeLists.txt file works (in 64-bit mode), but the resulting executable runs extremely slowly compared to the one built by Eclipse.

I use:

cmake -DCMAKE_BUILD_TYPE=Release ..
make

to build everything.

Now I wanted to try setting:

# Ask the host C and C++ compilers for 32-bit code (replaces any existing flags).
set(CMAKE_C_FLAGS "-m32")
set(CMAKE_CXX_FLAGS "-m32")
add_definitions("-m32")
# Ask FindCUDA to generate 32-bit device code.
set(CUDA_64_BIT_DEVICE_CODE OFF)

(CUDA_64_BIT_DEVICE_CODE set to OFF) in the CMakeLists.txt file (before calling find_package(CUDA)), but this gives me a linking error that I do not know how to fix:

Linking CXX executable my_project
/usr/local/cuda/lib64/libcudart.so: could not read symbols: File in wrong format
collect2: error: ld returned 1 exit status

I think CMake's own FindCUDA.cmake mixes 32-bit and 64-bit settings somewhere when I set CUDA_64_BIT_DEVICE_CODE to OFF, and the link step then fails.

Please, can anyone help me? I have been trying to fix the performance problem for two days and have not found any useful help on the web.

Thanks for all answers!

GeForce 750Ti
[Debian stable 64-bit, CUDA 5.5, CMAKE 2.8.12, CUSP 1.7 and Thrust 0.4]

My actual CMakeLists file:

cmake_minimum_required(VERSION 2.8.9)
project(my_project)

# Build for the host's native word size.
#
# Forcing a 32-bit build here (CUDA_64_BIT_DEVICE_CODE OFF plus -m32 in
# CMAKE_C_FLAGS / CMAKE_CXX_FLAGS / add_definitions) makes the linker reject
# the 64-bit CUDA runtime:
#   /usr/local/cuda/lib64/libcudart.so: could not read symbols: File in wrong format
# Assigning CMAKE_C_FLAGS / CMAKE_CXX_FLAGS with set() also discards any flags
# passed on the command line or through the environment, so those variables
# are intentionally left untouched.

# Third-party packages required by the project.
find_package(OpenCV REQUIRED)
find_package(CUDA REQUIRED)

###############################################################################
# Find THRUST
# (from here: https://groups.google.com/forum/#!topic/thrust-users/UX7Gm4piBiU)
#
# Inputs : CUDA_INCLUDE_DIRS (used as a search hint).
# Outputs: THRUST_INCLUDE_DIR - directory containing thrust/version.h
#          THRUST_VERSION     - "major.minor.subminor", when it can be parsed
#          THRUST_FOUND       - set by find_package_handle_standard_args
find_path( THRUST_INCLUDE_DIR
    HINTS ${CUDA_INCLUDE_DIRS} /usr/include/cuda /usr/local/include
    NAMES thrust/version.h
    DOC "Thrust headers"
)

# Only touch the header when it was actually located; reading through a
# NOTFOUND path would abort configuration with an unhelpful file() error.
if( THRUST_INCLUDE_DIR )
    include_directories( ${THRUST_INCLUDE_DIR} )

    # Extract the raw integer from the "#define THRUST_VERSION <number>" line.
    file( STRINGS "${THRUST_INCLUDE_DIR}/thrust/version.h"
          thrust_version_line
          REGEX "#define THRUST_VERSION[ \t]+([0-9x]+)"
    )
    string( REGEX REPLACE "#define THRUST_VERSION[ \t]+" ""
            thrust_version_number "${thrust_version_line}" )

    # THRUST_VERSION encodes major * 100000 + minor * 100 + subminor.
    # Decoding arithmetically also handles multi-digit minor versions.
    if( "${thrust_version_number}" MATCHES "^[0-9]+$" )
        math( EXPR thrust_version_major "${thrust_version_number} / 100000" )
        math( EXPR thrust_version_minor "(${thrust_version_number} / 100) % 1000" )
        math( EXPR thrust_version_subminor "${thrust_version_number} % 100" )
        set( THRUST_VERSION
             "${thrust_version_major}.${thrust_version_minor}.${thrust_version_subminor}" )
    endif()
endif()

# Check for required components.
# This is not a find module, so mark the package as required by hand to make
# a missing Thrust a clear fatal error rather than a later compile failure.
set( Thrust_FIND_REQUIRED TRUE )

include( FindPackageHandleStandardArgs )
find_package_handle_standard_args( Thrust
    REQUIRED_VARS
    THRUST_INCLUDE_DIR
    VERSION_VAR
    THRUST_VERSION
)
######################################################################################

# Extra nvcc code-generation flags, supplied as a semicolon-separated list, e.g.
#   cmake -DMY_PROJECT_CUDA_ARCH_FLAGS="-gencode;arch=compute_20,code=sm_20" ..
# Empty by default, in which case nvcc falls back to its own default target.
set(MY_PROJECT_CUDA_ARCH_FLAGS "" CACHE STRING
    "Additional nvcc architecture flags (semicolon-separated list)")

# Flags handed to nvcc by the FindCUDA macros.
list(APPEND CUDA_NVCC_FLAGS -O3)
if(MY_PROJECT_CUDA_ARCH_FLAGS)
    list(APPEND CUDA_NVCC_FLAGS ${MY_PROJECT_CUDA_ARCH_FLAGS})
endif()

# Collect the sources that live next to this CMakeLists.txt.
# NOTE: globbing is evaluated at configure time only; re-run cmake after
# adding or removing files.
file(GLOB HEADER_LIST "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
file(GLOB SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
file(GLOB CU_LIST "${CMAKE_CURRENT_SOURCE_DIR}/*.cu")

# Directory-scoped on purpose: the FindCUDA macros read the directory's
# include paths when they build the nvcc command lines.
include_directories(${PROJECT_SOURCE_DIR})

# Only source and header files belong in the source list; the Thrust include
# directory is not a source file and must not be passed here.
cuda_add_executable(${PROJECT_NAME}
    ${HEADER_LIST}
    ${SRC_LIST}
    ${CU_LIST}
)

# Plain (keyword-less) signature on purpose: cuda_add_executable() in this
# CMake generation links with the plain signature, and the two forms cannot
# be mixed on one target.
target_link_libraries(${PROJECT_NAME}
    ${CUDA_LIBRARIES}
    ${CUDA_CUBLAS_LIBRARIES}
    ${OpenCV_LIBS}
)