My question is whether compiling the executable and the library separately can cause the kernel launch to fail.
See the code:
// mykernel.cuh
// Declares the printData kernel so that other translation units can launch it.
// The guard macro deliberately avoids a double underscore: identifiers that
// contain "__" are reserved for the implementation in C++.
#ifndef MYKERNEL_CUH
#define MYKERNEL_CUH
// Kernel taking no arguments; its definition lives in mykernel.cu.
__global__
void printData();
#endif // MYKERNEL_CUH
//mykernel.cu
#include "mykernel.cuh"
#include <stdio.h>
// Prints "Datas: <x> " on its own line for every thread whose x index within
// the block is below 128, where <x> is that index. Threads with a larger
// x index print nothing. Takes no arguments and writes no memory.
__global__
void printData()
{
    // threadIdx.x is unsigned; convert it explicitly so the argument type
    // matches the %d conversion used in the format string.
    const int tid = static_cast<int>(threadIdx.x);
    if (tid < 128)
    {
        printf("Datas: %d \n", tid);
    }
}
// main.cu
#include "mykernel.cuh"
#include <stdio.h>
// Launches printData on a single block of 128 threads and waits for it to
// finish. Returns 0 on success and 1 when either the launch or the execution
// reports a CUDA error; the error string is written to stderr.
int main()
{
    printData<<<dim3(1,1,1),dim3(128,1,1)>>>();

    // A kernel launch has no return value, so a rejected launch can only be
    // detected by querying the last error right after it.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "printData launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Errors raised while the kernel runs are reported by the synchronizing call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "printData execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}
1. Build with CMake and make
The project is built with the following CMakeLists.txt:
# cmake_minimum_required() must be called before project() so that the policy
# settings are in place when the languages are enabled.
cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
project(mytest LANGUAGES CXX CUDA)

# CUDA is enabled as a first-class language by project(), so the deprecated
# find_package(CUDA) module is not needed. The former include_directories()
# call expanded ${CUDA_INCLUDE}, which is not set anywhere in this file.

# Install next to the sources unless the user chose a prefix explicitly.
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
  set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/mytestinstall" CACHE PATH "This is default path" FORCE)
endif()

set(_TEST_cu_sources_ ./mykernel.cu)

# ON : mykernel.cu is built into a shared library that testbin links against.
# OFF: main.cu and mykernel.cu are compiled into testbin directly.
option(TESTSHARED "Separation of bin and lib" ON)

if(TESTSHARED)
  add_library(libtest SHARED ${_TEST_cu_sources_})
  set_target_properties(libtest
    PROPERTIES
      CUDA_SEPARABLE_COMPILATION ON
      LINKER_LANGUAGE "CUDA"
  )

  add_executable(testbin ./main.cu)
  target_link_libraries(testbin PUBLIC libtest)

  # main.cu launches a kernel that is defined inside the shared library. When
  # the CUDA runtime is linked statically, the executable and the library each
  # carry their own copy of it, and a launch that crosses that boundary is
  # presumably what fails here. Linking both targets against the shared
  # runtime gives them one common copy.
  # NOTE(review): CUDA_RUNTIME_LIBRARY is honoured by CMake 3.17 and newer;
  # older releases ignore the property - confirm the CMake version in use.
  set_target_properties(libtest testbin
    PROPERTIES
      CUDA_RUNTIME_LIBRARY Shared
  )

  install(TARGETS libtest DESTINATION ${CMAKE_INSTALL_PREFIX})
else()
  add_executable(testbin ./main.cu ${_TEST_cu_sources_})
endif()

# $ORIGIN lets the installed executable find shared libraries that are
# installed into the same directory.
set_target_properties(testbin
  PROPERTIES
    CUDA_SEPARABLE_COMPILATION ON
    INSTALL_RPATH "$ORIGIN"
)
install(TARGETS testbin DESTINATION ${CMAKE_INSTALL_PREFIX})
If TESTSHARED is set to ON, main.cu and libtest are built separately (an executable plus a shared library); if it is OFF, both sources are compiled into a single executable.
Output:
TESTSHARED=ON: no output; the kernel fails to run.
TESTSHARED=OFF: the output is as expected; the kernel works.
Environment: Ubuntu 20.04, CUDA 12.2, compute capability 7.5 (sm_75).
2. Compile manually
Compile each unit with nvcc using -c and -dlink.
I am unsure whether -dc is preferable to -c here; after many attempts, it seems that both work.
Then link everything with g++, for both the shared library and the executable.
# Shared library, step 1: compile mykernel.cu to mykernel.o, passing -fPIC to the host compiler.
nvcc -c -Xcompiler -fPIC mykernel.cu
# Shared library, step 2: run the device-link step on mykernel.o, writing dlinka.o.
# NOTE(review): mykernel.o was compiled with -c rather than -dc/-rdc=true, so this step
# presumably has no relocatable device code to link - confirm whether it is needed at all.
nvcc -dlink -o dlinka.o -rdc=true -Xcompiler -fPIC mykernel.o
# Shared library, step 3: let g++ combine both objects into libmykernel.so.
g++ -shared -fPIC -o libmykernel.so mykernel.o dlinka.o
# Executable: the same compile and device-link steps, applied to main.cu.
nvcc -c -Xcompiler -fPIC main.cu
nvcc -dlink -o dlinkb.o -rdc=true -Xcompiler -fPIC main.o
# Final link with g++ against libmykernel.so and the cudart library found in
# /usr/local/cuda/lib64; the rpath "." makes the loader search the current directory.
# NOTE(review): -lcudart presumably resolves to the shared libcudart.so here - confirm with ldd.
g++ -o test main.o dlinkb.o libmykernel.so -L/usr/local/cuda/lib64 -lcudart -Wl,-rpath,.
Output: Method #2 works well.
3. Encapsulate the contents of main
Move the body of main into another function named execute, defined in mykernel.cu.
// added the execute declaration to mykernel.cuh
// Host function that launches printData and waits for it to finish.
void execute();
// added the execute definition to mykernel.cu
// Launches printData on a single block of 128 threads and blocks until the
// device is idle. A CUDA error from the launch or from the execution is
// written to stderr; the function then returns normally, because its void
// signature offers no way to hand a status back to the caller.
void execute()
{
    printData<<<dim3(1,1,1),dim3(128,1,1)>>>();

    // A kernel launch has no return value, so a rejected launch can only be
    // detected by querying the last error right after it.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "printData launch failed: %s\n", cudaGetErrorString(status));
        return;
    }

    // Errors raised while the kernel runs are reported by the synchronizing call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "printData execution failed: %s\n", cudaGetErrorString(status));
    }
}
// now main function is
// Entry point that only delegates to execute(); the kernel launch and the
// device synchronization are no longer written in this file.
int main()
{
// execute() is declared in mykernel.cuh and defined in mykernel.cu.
execute();
return 0;
}
Output: this method works fine.
From the results above, I suspect that my CMake setup does not give the CUDA executable a proper -dlink step or something similar (only the library gets one), whereas the manual compilation does.
The CUDA symbols defined directly in main.cu may not be exported and linked between the host and device sides, so the kernel fails to launch because of a wrong runtime configuration.
But this is just a guess.
I would appreciate any corrections.