Recently my project (an application using CUDA and cuBLAS, etc.) has encountered occasional crashes. After days of debugging and inspecting, I summarized the most suspicious part of my code and could reproduce the same errors. My summarized code goes like this:
#include <stdio.h>
#include <omp.h>
#include <mutex>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Checks a CUDA runtime call: on failure, prints the failing statement text
// and the runtime's error string, then triggers a debug break.
// NOTE(review): __debugbreak() is MSVC/Windows-specific; with no debugger
// attached it raises a structured exception, and if that is swallowed,
// execution continues past the failed call.
#define CUDA_CALL(stmt) \
do { \
cudaError_t cuda_status = (stmt); \
if (cuda_status != cudaSuccess) { \
fprintf(stderr, "%s failed, error: %s\n", #stmt, cudaGetErrorString(cuda_status)); \
__debugbreak(); \
} \
} while (0)
// Checks a cuBLAS call: on failure, prints the failing statement text and the
// numeric cublasStatus_t, then triggers an MSVC debug break.
// (Previously this macro broke silently with no diagnostic, unlike CUDA_CALL;
// cuBLAS in CUDA 8 has no cudaGetErrorString equivalent, so the raw status
// code is printed instead.)
#define CUBLAS_CALL(stmt) \
do { \
cublasStatus_t cublas_status = (stmt); \
if (cublas_status != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "%s failed, cublas status: %d\n", #stmt, (int)cublas_status); \
__debugbreak(); \
} \
} while (0)
// Single cuBLAS handle shared by every OpenMP worker thread (see main()).
static cublasHandle_t cublas_handle_;
// Rounds `size` up to the next multiple of `align`.
// NOTE(review): correct only when `align` is a power of two (256 below is).
#define ALIGN_TO(size, align) (((size) + (align) - 1) & -(align))
// Allocates `size` bytes of device memory, rounded up to a 256-byte multiple.
// Calls are serialized by a function-local mutex (cudaMalloc is itself
// thread-safe; the lock mirrors the original application's behavior).
// Returns NULL on allocation failure instead of an indeterminate pointer --
// the original left `ret` uninitialized and ignored the cudaMalloc status, so
// a failed allocation would hand garbage to cublasSgemm later.
void *AllocateMemory(size_t size) {
  static std::mutex mutex;
  std::lock_guard<std::mutex> lock_(mutex);
  void *ret = NULL;
  cudaError_t status = cudaMalloc(&ret, ALIGN_TO(size, 256));
  if (status != cudaSuccess) {
    fprintf(stderr, "cudaMalloc(%zu) failed, error: %s\n", size,
            cudaGetErrorString(status));
    ret = NULL;
  }
  return ret;
}
// Global lock serializing the host-side cublasSetStream/cublasSgemm pair in
// main()'s inner loop (protects the shared handle's host state only).
static std::mutex mutex;
// NOTE(review): arr is never referenced anywhere in the code shown here --
// presumably leftover from the larger application; confirm before removing.
float arr[1111111];
// Crash reproducer: many OpenMP threads issue SGEMMs through ONE shared
// cuBLAS handle, each binding it to its own per-thread stream. The mutex
// serializes only the host-side submission -- the launched kernels are
// asynchronous and run concurrently on the different per-thread streams.
int main() {
// thread_local: each OpenMP worker thread gets its own copy of these.
thread_local bool thread_first = 0;
thread_local float *A, *B, *C;
// One handle created once and shared by all worker threads.
CUBLAS_CALL(cublasCreate_v2(&cublas_handle_));
#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < 10000; ++i) {
CUDA_CALL(cudaSetDevice(0));
printf("%d\n", i);
int N = 256, M = 256, K = 4714;
// Lazy one-time per-thread allocation of the operand buffers. They are never
// freed (leaked at exit) and never initialized -- cudaMalloc does not zero
// memory, so the SGEMM operates on garbage values; presumably irrelevant to
// the crash being reproduced.
if (!thread_first) {
thread_first = true;
A = (float *)AllocateMemory(N * K * sizeof(float));
B = (float *)AllocateMemory(K * M * sizeof(float));
C = (float *)AllocateMemory(N * M * sizeof(float));
}
// Call SGEMM to perform C = A * B + C (row-major view; cuBLAS is
// column-major, hence the swapped operand order in the call below).
for (int j = 0; j < 100; ++j) {
mutex.lock();
float alpha = 1.0f, beta = 1.0f;
// The lock guards the handle's host-side state (its stream binding), but
// kernel launches return immediately: SGEMMs submitted by different threads
// execute concurrently on their respective per-thread streams.
// NOTE(review): a cuBLAS handle owns internal device workspace; concurrent
// kernels launched through the same handle onto different streams would be
// sharing that workspace -- the prime suspect for the "unspecified launch
// failure". Confirm against the cuBLAS thread-safety/stream documentation;
// per-thread handles (which the post says avoids the crash) would give each
// stream its own workspace.
CUBLAS_CALL(cublasSetStream_v2(cublas_handle_, cudaStreamPerThread));
CUBLAS_CALL(cublasSgemm_v2(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, B, M, A, K, &beta, C, M));
mutex.unlock();
}
// Asynchronous execution errors from this thread's stream surface here.
CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
}
CUBLAS_CALL(cublasDestroy_v2(cublas_handle_));
return 0;
}
One can compile this code under Windows 10 with Visual Studio 2015 + CUDA 8.0, optionally via the following CMakeLists.txt (the source file above is named sgemm.cu):
# Minimal build for the SGEMM crash reproducer (reported environment:
# Windows 10, Visual Studio 2015, CUDA 8.0, legacy FindCUDA module).
cmake_minimum_required(VERSION 2.8)
project(CUDASGEMMTEST)
# OpenMP supplies the host thread pool driving the parallel-for reproducer.
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
find_package(CUDA)
# NOTE(review): sm_20 (Fermi) is deprecated in CUDA 8.0 and may not match the
# GPU actually used -- verify against the target hardware's compute capability.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_20)
set(CUDASRCS
sgemm.cu
)
CUDA_ADD_EXECUTABLE(cuda_test_sgemm ${CUDASRCS})
# Links the cuBLAS library into the target (legacy FindCUDA helper macro).
CUDA_ADD_CUBLAS_TO_TARGET(cuda_test_sgemm)
When I run the code above, the program crashes randomly with error “cudaStreamSynchronize(cudaStreamPerThread) failed, error: unspecified launch failure”
I know it is risky to use one shared cuBLAS handle among multiple host threads, but I can't conclude that this is the final reason, since the documentation (http://docs.nvidia.com/cuda/cublas/index.html#thread-safety2) never says I can't do that — only that it is not recommended.
Meanwhile, I also tried not using cudaStreamPerThread; instead, I manually created one stream per thread, and that saved the program from crashing — which makes me very confused.
So I really wonder whether it is okay for multiple host threads to share one handle, and what a possible reason for the crash of the above program might be.