cudaMalloc failed because of one additional .cu to compile

Hi forum,
I came across a strange error this morning: the code I have been working on started reporting a cudaMalloc failure after I added one more .cu file to the build. My cudaMalloc call and error check are:

// Start from a null device pointer so a failed allocation can be detected below.
float* device_matA = 0;
// Request M * K floats of device memory and keep the runtime's status code.
cudaError_t err = cudaMalloc((void**)&device_matA, M * K * sizeof(float));
// Printed unconditionally, i.e. also when err is cudaSuccess.
printf("Error: %s\n", cudaGetErrorString(err));
// NOTE(review): device_matBT and device_matC are tested here but their
// allocation is not part of this snippet — presumably done the same way.
if(device_matA == 0 || device_matBT == 0 || device_matC == 0) {
    printf("couldn't allocate memory\n");
    return matC;
}

and it returns

Error: unknown error
couldn't allocate memory

It seems that the code did not core dump or crash; it just did not change device_matA and left it at 0… But my disk usage manager still shows over 30GB of free space.
Here are some details:
In the project, there are several .cu files that I want to compile and link to a Python API via pybind:

src/
--common.cu
--matrix_mul_naive.cu
--matrix_mul_half.cu
--matrix_mul_simt.cu

In my CMakeLists.txt, I compile all those .cu files as:

# Build all four CUDA sources into one static library.
add_library(matrix_mul_lib_kernel STATIC src/common.cu src/matrix_mul_naive.cu src/matrix_mul_half.cu src/matrix_mul_simt.cu)

# Position-independent code, hidden symbol visibility, separable compilation
# (relocatable device code), compiled for CUDA architecture 87.
set_target_properties(matrix_mul_lib_kernel PROPERTIES 
    POSITION_INDEPENDENT_CODE ON
    CUDA_VISIBILITY_PRESET "hidden"
    CUDA_SEPARABLE_COMPILATION ON
    CUDA_ARCHITECTURES 87
)

(I will post the full CMakeList.txt in the reply)
The cudaMalloc issue occurs if I compile with the code above, but I found that if I remove the last file, matrix_mul_simt.cu, from CMakeLists.txt, the code works again and outputs correct results:

# Same static library, with src/matrix_mul_simt.cu left out of the source list.
add_library(matrix_mul_lib_kernel STATIC src/common.cu src/matrix_mul_naive.cu src/matrix_mul_half.cu)

# Position-independent code, hidden symbol visibility, separable compilation
# (relocatable device code), compiled for CUDA architecture 87.
set_target_properties(matrix_mul_lib_kernel PROPERTIES 
    POSITION_INDEPENDENT_CODE ON
    CUDA_VISIBILITY_PRESET "hidden"
    CUDA_SEPARABLE_COMPILATION ON
    CUDA_ARCHITECTURES 87
)

Do you know how I could correctly compile all those functions? Thank you!

Best,
Chengzhe

Here is the full CMakeLists.txt:

# Builds a Python extension module (matrix_mul_lib) from a pybind11 binding
# source plus a static library holding all CUDA kernels.
cmake_minimum_required(VERSION 3.16)
project(matrix_mul_lib LANGUAGES CXX CUDA)

# NOTE(review): CUDA is already enabled as a language by project() above;
# find_package(CUDA) is the legacy FindCUDA module — confirm it is still needed.
find_package(CUDA REQUIRED)
# pybind11 is vendored next to this project and built into ./pybind11.
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../3rdparty/pybind11 ./pybind11)

set(CMAKE_CXX_STANDARD 11)
set(PYBIND11_CPP_STANDARD -std=c++11)

# Header search paths for every target defined below.
# NOTE(review): /usr/local/cuda/lib64 is a library directory, not a header
# directory — listing it here looks unintended; confirm.
include_directories(
    ${PYTHON_INCLUDE_DIRS}
    ${CMAKE_CURRENT_SOURCE_DIR}/../3rdparty/pybind11/include
    /usr/local/cuda/lib64
    /usr/local/cuda/include
)

# Project-local headers.
include_directories(include)

# All four CUDA sources go into one static library.
add_library(matrix_mul_lib_kernel STATIC src/common.cu src/matrix_mul_naive.cu src/matrix_mul_half.cu src/matrix_mul_simt.cu)

# Position-independent code, hidden symbol visibility, separable compilation
# (relocatable device code), compiled for CUDA architecture 87.
set_target_properties(matrix_mul_lib_kernel PROPERTIES 
    POSITION_INDEPENDENT_CODE ON
    CUDA_VISIBILITY_PRESET "hidden"
    CUDA_SEPARABLE_COMPILATION ON
    CUDA_ARCHITECTURES 87
)

# The loadable module itself is built from the C++ binding source only.
add_library(matrix_mul_lib MODULE src/matrix_mul.cc)

# Hidden C++ symbols, interprocedural optimization, and file-name prefix/suffix
# taken from PYTHON_MODULE_PREFIX / PYTHON_MODULE_EXTENSION
# (presumably provided by the pybind11 subdirectory — verify).
set_target_properties(matrix_mul_lib PROPERTIES 
    CXX_VISIBILITY_PRESET "hidden"
    INTERPROCEDURAL_OPTIMIZATION TRUE
    PREFIX "${PYTHON_MODULE_PREFIX}"
    SUFFIX "${PYTHON_MODULE_EXTENSION}"
)
# Link the kernel library and pybind11 into the module.
target_link_libraries(matrix_mul_lib PRIVATE matrix_mul_lib_kernel)
target_link_libraries(matrix_mul_lib PRIVATE pybind11::module)

and the cmake version is

$ cmake --version
cmake version 3.27.4

CMake suite maintained and supported by Kitware (kitware.com/cmake).

May I have some guidance on why this is the case?
Thank you!

In matrix_mul_simt.cu, I am trying to do a matrix multiplication, and the host function (where I cudaMalloc the global memory) is:

/**
 * Host-side driver for matrix_mul_smit_kernel_128x128.
 *
 * Steps performed:
 *   1. allocate float and __half device buffers for A (M*K), BT (N*K), C (M*N);
 *   2. copy the three host matrices to the device and convert them to __half;
 *   3. launch the kernel on the __half buffers viewed as __half2 and time it;
 *   4. convert C back to float and copy it into matC on the host;
 *   5. free every device buffer.
 *
 * Every CUDA runtime call and kernel launch is checked; on failure the reason
 * is printed, all device buffers are released and matC is returned as-is.
 *
 * @param matA   host matrix holding M*K floats
 * @param matBT  host matrix holding N*K floats (presumably B transposed — confirm)
 * @param matC   host matrix holding M*N floats; uploaded as input and
 *               overwritten with the result
 * @param M,N,K  matrix dimensions
 * @return a copy of matC
 */
matrix_template matrix_mul_smit_host(const matrix_template& matA, const matrix_template& matBT, matrix_template& matC, int M, int N, int K) {
    event_pair timer;

    // Element counts are computed in size_t so the byte sizes below cannot
    // overflow int arithmetic for large matrices.
    const size_t count_A  = static_cast<size_t>(M) * K;
    const size_t count_BT = static_cast<size_t>(N) * K;
    const size_t count_C  = static_cast<size_t>(M) * N;

    float* device_matA = 0;
    float* device_matBT = 0;
    float* device_matC = 0;
    __half* device_matA_h = 0;
    __half* device_matBT_h = 0;
    __half* device_matC_h = 0;

    // Frees every buffer; cudaFree on a null pointer is a no-op, so this can be
    // called from any error path regardless of how far allocation got.
    auto release_all = [&]() {
        cudaFree(device_matA);
        cudaFree(device_matBT);
        cudaFree(device_matC);
        cudaFree(device_matA_h);
        cudaFree(device_matBT_h);
        cudaFree(device_matC_h);
    };

    // Prints the failing step and returns true when `status` is an error.
    auto failed = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return false;
        }
        printf("Error: %s: %s\n", what, cudaGetErrorString(status));
        return true;
    };

    // cudaMalloc device arrays (float inputs/outputs and their __half copies).
    // Evaluation stops at the first failing allocation.
    const bool alloc_failed =
        failed(cudaMalloc((void**)&device_matA, count_A * sizeof(float)), "cudaMalloc(device_matA)") ||
        failed(cudaMalloc((void**)&device_matBT, count_BT * sizeof(float)), "cudaMalloc(device_matBT)") ||
        failed(cudaMalloc((void**)&device_matC, count_C * sizeof(float)), "cudaMalloc(device_matC)") ||
        failed(cudaMalloc((void**)&device_matA_h, count_A * sizeof(__half)), "cudaMalloc(device_matA_h)") ||
        failed(cudaMalloc((void**)&device_matBT_h, count_BT * sizeof(__half)), "cudaMalloc(device_matBT_h)") ||
        failed(cudaMalloc((void**)&device_matC_h, count_C * sizeof(__half)), "cudaMalloc(device_matC_h)");
    if (alloc_failed) {
        printf("couldn't allocate memory\n");
        release_all();  // do not leak the buffers that were allocated successfully
        return matC;
    }

    // cuda mem copy: host -> device
    const bool upload_failed =
        failed(cudaMemcpy(device_matA, matA.data(), count_A * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(matA)") ||
        failed(cudaMemcpy(device_matBT, matBT.data(), count_BT * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(matBT)") ||
        failed(cudaMemcpy(device_matC, matC.data(), count_C * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(matC)");
    if (upload_failed) {
        release_all();
        return matC;
    }

    // Convert the float inputs to half precision on the device.
    cast_kernel_float2half<<<128, 256>>>(device_matA_h, device_matA, M * K);
    cast_kernel_float2half<<<128, 256>>>(device_matBT_h, device_matBT, N * K);
    cast_kernel_float2half<<<128, 256>>>(device_matC_h, device_matC, M * N);
    if (failed(cudaGetLastError(), "cast_kernel_float2half launch")) {
        release_all();
        return matC;
    }

    // The matmul kernel takes the same buffers viewed as __half2.
    __half2* device_matA_h2 = reinterpret_cast<__half2 *>(device_matA_h);
    __half2* device_matBT_h2 = reinterpret_cast<__half2 *>(device_matBT_h);
    __half2* device_matC_h2 = reinterpret_cast<__half2 *>(device_matC_h);

    // kernel call
    // NOTE(review): integer division — assumes M * N is a multiple of 128 * 128.
    // A result of 0 is an invalid launch configuration, reported below.
    int block_size = 16 * 16;
    int grid_size = (M * N) / (128 * 128);
    start_timer(&timer);
    matrix_mul_smit_kernel_128x128<<<grid_size, block_size>>>(device_matA_h2, device_matBT_h2, device_matC_h2, M, N, K);
    float kernel_time_ms = stop_timer(&timer);
    // Checked after the timer stops so the timed region is unchanged.
    if (failed(cudaGetLastError(), "matrix_mul_smit_kernel_128x128 launch")) {
        release_all();
        return matC;
    }

    // Convert the half-precision result back to float and download it.
    cast_kernel_half2float<<<128, 256>>>(device_matC, device_matC_h, M * N);
    failed(cudaGetLastError(), "cast_kernel_half2float launch");
    // The blocking copy also reports errors raised while the kernels executed.
    failed(cudaMemcpy(matC.data(), device_matC, count_C * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(result)");

    release_all();
    printf("cuda kernel <matrix_mul_smit_kernel_128x128> runtime %f ms.\n", kernel_time_ms);
    return matC;
}

I have seen odd behavior like this when compiling and linking together multiple libraries in CUDA. I have some mental models I would follow when trying to do that (for example: 1. make every effort to ensure that the CUDA runtime library only gets linked once in the entire build process, and 2. especially in the case of device linking, try to make sure that each device link step has no “overlap” with any other device link phase), but I’m not an expert in CMake, and all the CMake machinery here gets in the way of my understanding of what is going on. It looks to me like you are building at least two libraries, but I’m not certain of that.

I’m generally not much help with CMake, and can’t offer any useful suggestions when CMake is hiding what is going on, behind its own language.

1 Like

Hi Robert,

You are correct. It turns out that the reason is that I linked too many .cu files into the same target. Following your suggestion and the Stack Overflow question "c++ - How to properly link libraries with cmake? - Stack Overflow", I modified my CMakeLists.txt and everything works now (lol, although my SIMT CUDA mat mul gives the wrong results, that should be a kernel-logic issue; the strange cudaMalloc error is confirmed to be fixed).
The modified version:

# Builds a Python extension module (matrix_mul_lib) from a pybind11 binding
# source plus one static library per CUDA source file.
cmake_minimum_required(VERSION 3.16)
project(matrix_mul_lib LANGUAGES CXX CUDA)

# NOTE(review): CUDA is already enabled as a language by project() above;
# find_package(CUDA) is the legacy FindCUDA module — confirm it is still needed.
find_package(CUDA REQUIRED)
# pybind11 is vendored next to this project and built into ./pybind11.
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../3rdparty/pybind11 ./pybind11)

set(CMAKE_CXX_STANDARD 11)
set(PYBIND11_CPP_STANDARD -std=c++11)

# Header search paths for every target defined below.
# NOTE(review): /usr/local/cuda/lib64 is a library directory, not a header
# directory — listing it here looks unintended; confirm.
include_directories(
    ${PYTHON_INCLUDE_DIRS}
    ${CMAKE_CURRENT_SOURCE_DIR}/../3rdparty/pybind11/include
    /usr/local/cuda/lib64
    /usr/local/cuda/include
)

# Project-local headers.
include_directories(include)

# All four CUDA libraries below share the same settings: position-independent
# code, hidden symbol visibility, separable compilation (relocatable device
# code), compiled for CUDA architecture 87.

# common helper functions
add_library(matrix_mul_lib_common_helpers STATIC src/common.cu)
set_target_properties(matrix_mul_lib_common_helpers PROPERTIES 
    POSITION_INDEPENDENT_CODE ON
    CUDA_VISIBILITY_PRESET "hidden"
    CUDA_SEPARABLE_COMPILATION ON
    CUDA_ARCHITECTURES 87
)
# naive version
add_library(matrix_mul_lib_naive STATIC src/matrix_mul_naive.cu)
set_target_properties(matrix_mul_lib_naive PROPERTIES 
    POSITION_INDEPENDENT_CODE ON
    CUDA_VISIBILITY_PRESET "hidden"
    CUDA_SEPARABLE_COMPILATION ON
    CUDA_ARCHITECTURES 87
)
# Each variant links the shared helpers privately.
target_link_libraries(matrix_mul_lib_naive PRIVATE matrix_mul_lib_common_helpers)
# half version
add_library(matrix_mul_lib_half STATIC src/matrix_mul_half.cu)
set_target_properties(matrix_mul_lib_half PROPERTIES 
    POSITION_INDEPENDENT_CODE ON
    CUDA_VISIBILITY_PRESET "hidden"
    CUDA_SEPARABLE_COMPILATION ON
    CUDA_ARCHITECTURES 87
)
target_link_libraries(matrix_mul_lib_half PRIVATE matrix_mul_lib_common_helpers)
# SIMT version
add_library(matrix_mul_lib_simt STATIC src/matrix_mul_simt.cu)
set_target_properties(matrix_mul_lib_simt PROPERTIES 
    POSITION_INDEPENDENT_CODE ON
    CUDA_VISIBILITY_PRESET "hidden"
    CUDA_SEPARABLE_COMPILATION ON
    CUDA_ARCHITECTURES 87
)
target_link_libraries(matrix_mul_lib_simt PRIVATE matrix_mul_lib_common_helpers)

# The loadable module itself is built from the C++ binding source only.
add_library(matrix_mul_lib MODULE src/matrix_mul.cc)

# Hidden C++ symbols, interprocedural optimization, and file-name prefix/suffix
# taken from PYTHON_MODULE_PREFIX / PYTHON_MODULE_EXTENSION
# (presumably provided by the pybind11 subdirectory — verify).
# CUDA_ARCHITECTURES is also set on this target, unlike in the earlier version.
set_target_properties(matrix_mul_lib PROPERTIES 
    CXX_VISIBILITY_PRESET "hidden"
    INTERPROCEDURAL_OPTIMIZATION TRUE
    PREFIX "${PYTHON_MODULE_PREFIX}"
    SUFFIX "${PYTHON_MODULE_EXTENSION}"
    CUDA_ARCHITECTURES 87
)
# Link the three kernel variants and pybind11 into the module.
target_link_libraries(matrix_mul_lib PRIVATE matrix_mul_lib_naive)
target_link_libraries(matrix_mul_lib PRIVATE matrix_mul_lib_half)
target_link_libraries(matrix_mul_lib PRIVATE matrix_mul_lib_simt)
target_link_libraries(matrix_mul_lib PRIVATE pybind11::module)

Thank you so much! I believe your mental models are really useful for my future CMake practice!
Best,
Chengzhe

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.