cuBLAS SGEMM randomly crashes when running multiple host threads sharing one cuBLAS handle.

Recently my project (an application using CUDA, cuBLAS, etc.) has been crashing occasionally. After days of debugging and inspection, I narrowed it down to the most suspicious part of my code and can now reproduce the same errors. The reduced code looks like this:

#include <stdio.h>
#include <omp.h>
#include <mutex>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define CUDA_CALL(stmt)	\
	do { \
		cudaError_t cuda_status = (stmt); \
		if (cuda_status != cudaSuccess) { \
			fprintf(stderr, "%s failed, error: %s\n", #stmt, cudaGetErrorString(cuda_status)); \
			__debugbreak(); \
		} \
	} while (0)

#define CUBLAS_CALL(stmt) \
	do { \
		cublasStatus_t cublas_status = (stmt); \
		if (cublas_status != CUBLAS_STATUS_SUCCESS) { \
			__debugbreak(); \
		} \
	} while (0)

static cublasHandle_t cublas_handle_;

#define ALIGN_TO(size, align) (((size) + (align) - 1) & -(align))
void *AllocateMemory(size_t size) {
	static std::mutex mutex;
	std::lock_guard<std::mutex> lock_(mutex);
	void *ret;
	cudaMalloc(&ret, ALIGN_TO(size, 256));
	return ret;
}

static std::mutex mutex;
float arr[1111111];
int main() {
	thread_local bool thread_first = 0;
	thread_local float *A, *B, *C;
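	// thread_local: each OpenMP worker thread gets its own first-use flag and its own A/B/C device buffers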
	CUBLAS_CALL(cublasCreate_v2(&cublas_handle_));
#pragma omp parallel for schedule(dynamic) 
	for (int i = 0; i < 10000; ++i) {
		CUDA_CALL(cudaSetDevice(0));
		printf("%d\n", i);
		int N = 256, M = 256, K = 4714;
		if (!thread_first) {
			thread_first = true;
			A = (float *)AllocateMemory(N * K * sizeof(float));
			B = (float *)AllocateMemory(K * M * sizeof(float));
			C = (float *)AllocateMemory(N * M * sizeof(float));
		}
		//call SGEMM to perform C = A * B + C
		for (int j = 0; j < 100; ++j) {
			mutex.lock();
			float alpha = 1.0f, beta = 1.0f;
			CUBLAS_CALL(cublasSetStream_v2(cublas_handle_, cudaStreamPerThread));
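			// operands are passed as (B, A) so the column-major GEMM yields row-major C = A * B + C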
			CUBLAS_CALL(cublasSgemm_v2(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, B, M, A, K, &beta, C, M));
			mutex.unlock();
		}
		CUDA_CALL(cudaStreamSynchronize(cudaStreamPerThread));
	}
	CUBLAS_CALL(cublasDestroy_v2(cublas_handle_));
	return 0;
}

The code can be compiled under Windows 10 with Visual Studio 2015 + CUDA 8.0, optionally via the following CMakeLists.txt (the source file above is named sgemm.cu):

cmake_minimum_required(VERSION 2.8)
project(CUDASGEMMTEST)
find_package(OpenMP)
if (OPENMP_FOUND)
    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
find_package(CUDA)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_20)
set(CUDASRCS
	sgemm.cu 
)
CUDA_ADD_EXECUTABLE(cuda_test_sgemm ${CUDASRCS})
CUDA_ADD_CUBLAS_TO_TARGET(cuda_test_sgemm)

When I run the code above, the program crashes randomly with error “cudaStreamSynchronize(cudaStreamPerThread) failed, error: unspecified launch failure”

I know it may be risky to use one shared cuBLAS handle among multiple host threads, but I can't conclude that this is the real cause, since the documentation (http://docs.nvidia.com/cuda/cublas/index.html#thread-safety) never says I can't do that, only that it is not recommended.

Meanwhile, I also tried not using cudaStreamPerThread and instead manually created one stream for each thread, and that stopped the program from crashing, which makes me even more confused.
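For reference, the manually-created-stream variant looks roughly like this (a minimal sketch rather than my exact code; the function name SgemmOnThreadStream and the omitted error checks are just for illustration, and the SGEMM call itself is unchanged):

// Sketch: the same shared handle, but each host thread binds its own explicitly
// created stream instead of cudaStreamPerThread.
#include <mutex>
#include <cuda_runtime.h>
#include <cublas_v2.h>

static cublasHandle_t cublas_handle_;   // created once in main() with cublasCreate_v2, as above
static std::mutex sgemm_mutex;          // still serializes access to the shared handle

void SgemmOnThreadStream(const float *A, const float *B, float *C, int N, int M, int K) {
	thread_local cudaStream_t stream = nullptr;      // one stream per host thread
	if (stream == nullptr)
		cudaStreamCreate(&stream);                   // error checking omitted in this sketch
	float alpha = 1.0f, beta = 1.0f;
	std::lock_guard<std::mutex> lock_(sgemm_mutex);
	cublasSetStream_v2(cublas_handle_, stream);      // bind this thread's own stream
	cublasSgemm_v2(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, B, M, A, K, &beta, C, M);
}

The only difference from the crashing version is that cublasSetStream_v2 receives an explicitly created thread_local stream instead of cudaStreamPerThread; everything else, including the mutex around the call, stays the same.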

So I really wonder: is it actually okay for multiple host threads to share one cuBLAS handle, and what could be the reason for the crash in the program above?
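For comparison, the non-shared arrangement would give every host thread its own handle, roughly like this (again only a sketch I have not run; the helper name GetThreadLocalHandle is purely illustrative):

// Sketch of the non-shared alternative: one cuBLAS handle per host thread,
// so there is no handle state shared across threads at all.
#include <cuda_runtime.h>
#include <cublas_v2.h>

cublasHandle_t GetThreadLocalHandle() {
	thread_local cublasHandle_t handle = nullptr;
	if (handle == nullptr) {
		cublasCreate_v2(&handle);                         // error checking omitted
		cublasSetStream_v2(handle, cudaStreamPerThread);  // bind once per thread
	}
	return handle;
}

With one handle per thread, the mutex around the SGEMM call should no longer be necessary.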

I forgot to mention that my GPU is an NVIDIA GeForce GTX 1070, and an example output of the program above looks like this:

3
1
2
0
4
…
50
cudaStreamSynchronize(cudaStreamPerThread) failed, error: unspecified launch failure

Thanks for helping!

Normally, if you intend to use the per-thread methodology for the default stream, I would expect you to compile with the --default-stream per-thread compiler switch, but I don't see it in your CMake setup.

Is that a part of your setup?
How many OMP threads are you running? (looks like 4)
Have you modified the WDDM TDR timeout on your machine at all?

When I run your code on a K20X on Linux, it runs for the full 10000 iterations without a runtime error. So I'm suspicious that this may just be a WDDM TDR timeout.

Thanks for the response, and yes, I used 4 CPU threads in OMP. I also tried 2 and it still crashes.

I switched to Linux and ran the same code, and the program finished without any runtime errors, just like your result. The problem is that when I switched back to Windows, changed the WDDM TDR timeout from 2 to 10 seconds, and rebooted, the program above still crashed, except that it hung for about 5x longer before going down. When I disabled WDDM TDR and ran it again, the system stopped responding for ~10 minutes before I physically rebooted my machine (it takes only ~2.5 minutes to finish when I run it on Linux). Are there other problems in the program?

I also tried appending ‘--default-stream per-thread’ to the “set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_20)” line in the CMakeLists; it didn't make any difference.

After some slight changes, I can now reproduce the same crash under Ubuntu 16.04 -___-
Here are the corresponding code and CMakeLists:

#include <stdio.h>
#include <omp.h>
#include <mutex>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#ifdef _MSC_VER
#define DEBUG_BREAK()	__debugbreak()
#else
#include <signal.h>
#define DEBUG_BREAK()	raise(SIGTRAP)
#endif

#define CUDA_CALL(stmt)	\
	do { \
		cudaError_t cuda_status = (stmt); \
		if (cuda_status != cudaSuccess) { \
			fprintf(stderr, "%s failed, error: %s\n", #stmt, cudaGetErrorString(cuda_status)); \
			DEBUG_BREAK(); \
		} \
	} while (0)

#define CUBLAS_CALL(stmt) \
	do { \
		cublasStatus_t cublas_status = (stmt); \
		if (cublas_status != CUBLAS_STATUS_SUCCESS) { \
			DEBUG_BREAK(); \
		} \
	} while (0)
	
static cublasHandle_t cublas_handle_;
static std::mutex cublas_lock;
int main() {
	CUBLAS_CALL(cublasCreate_v2(&cublas_handle_));
	omp_set_num_threads(2);
#pragma omp parallel for schedule(dynamic) 
	for (int i = 0; i < 10000; ++i) {
		CUDA_CALL(cudaSetDevice(0));
		printf("%d\n", i);
		int N = 80 + rand() % 50, M = 3120, K = 3120;
		float *A, *B, *C;
		cudaMalloc(&A, N * K * sizeof(float));
		cudaMalloc(&B, K * M * sizeof(float));
		cudaMalloc(&C, N * M * sizeof(float));
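		// note: these cudaMalloc calls are not wrapped in CUDA_CALL, so an allocation failure would go unnoticed here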
		//call SGEMM to perform C = A * B + C
		for (int j = 0; j < 3; ++j) {
			std::lock_guard<std::mutex> lock_(cublas_lock);
			float alpha = 1.0f, beta = 1.0f;
			CUBLAS_CALL(cublasSetStream_v2(cublas_handle_, cudaStreamPerThread));
			CUBLAS_CALL(cublasSgemm_v2(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, B, M, A, K, &beta, C, M));
		}
		cudaFree((void *)A);
		cudaFree((void *)B);
		cudaFree((void *)C);
	}
	CUBLAS_CALL(cublasDestroy_v2(cublas_handle_));
	return 0;
}
And the CMakeLists.txt:

cmake_minimum_required(VERSION 2.8)

project(CUDASGEMMTEST)

find_package(OpenMP)
if (OPENMP_FOUND)
    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
find_package(CUDA)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30 --default-stream=per-thread --std=c++11)
set(CUDASRCS
	sgemm.cu 
)
CUDA_ADD_EXECUTABLE(cuda_test_sgemm ${CUDASRCS})
CUDA_ADD_CUBLAS_TO_TARGET(cuda_test_sgemm)

One possible reason for the previous error is that the card I chose to run the CUDA application on is also driving the display output, which may cause random errors due to WDDM behavior.
But this time I compiled my code under Linux and turned off X before running it, and encountered the same problem.

I ran your code on two separate systems, one using CUDA 8 on Linux and the other using CUDA 9.1 on Linux, and in both cases it ran the full 10000 iterations without throwing an error.