The test program below creates a host thread for each GPU. Each host thread creates a cuFFT plan and executes the FFT. Most of the time, the program seems to run without error. However, sometimes it fails in a variety of ways (see below for example outputs).
Now, if I protect the whole thread with a mutex (i.e., only one thread can actually execute at a time), the program does not fail. Protecting only the cufftPlan1d call or only the cufftExecR2C call with a mutex results in the program failing. Have I misunderstood the documentation when it says:
Thread-safe API that can be called from multiple independent host threads
I’m running this program on a machine that has two Intel Xeon E5620 CPUs, 48 GB of host memory, and four Tesla C2075 cards, none of which are being used for display. The operating system is Linux (Debian 7.4), and I have CUDA version 5.5 installed. The NVIDIA driver version is 319.82. For reference, error codes 4 and 11 from cuFFT are CUFFT_INVALID_VALUE and CUFFT_INVALID_DEVICE respectively.
Does anybody know why the program is failing?
Example 1
4 CUDA device(s) found
Device 3 initialized
Device 2 initialized
Device 1 initialized
FFT execution failed for device 1, status = 11
Device 0 initialized
Device 3 deinitialized
Device 2 deinitialized
Device 0 deinitialized
Note that the device 1 thread did not terminate.
Example 2
4 CUDA device(s) found
Device 0 initialized
Device 2 initialized
Device 1 initialized
Device 3 initialized
FFT execution failed for device 3, status = 11
Device 2 deinitialized
Device 0 deinitialized
Device 1 deinitialized
Example 3
4 CUDA device(s) found
Device 1 initialized
Device 2 initialized
FFT execution failed for device 2, status = 4
Device 1 deinitialized
Device 3 initialized
Device 0 initialized
FFT execution failed for device 0, status = 4
Device 3 deinitialized
Example 4
4 CUDA device(s) found
Segmentation fault
Example 5
4 CUDA device(s) found
Device 3 initialized
Device 2 initialized
Device 3 deinitialized
Plan creation failed for device 0, status = 4
^C
In the last example, the program didn’t terminate.
Example 6
If I run multiple copies of this program at the same time using `for i in {0..9}; do ./pthread_cuda & done`, it fails in new and interesting ways:
4 CUDA device(s) found
4 CUDA device(s) found
4 CUDA device(s) found
4 CUDA device(s) found
4 CUDA device(s) found
4 CUDA device(s) found
pthread_cuda: pthread_mutex_lock.c:84: __pthread_mutex_lock: Assertion `mutex->__data.__owner == 0' failed.
4 CUDA device(s) found
4 CUDA device(s) found
4 CUDA device(s) found
I don’t use a mutex in the program, so is this problem a bug in the cuFFT library?
pthread_cuda code
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <malloc.h>
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Mutex for the serialization experiments described in the text above;
// the program as posted never locks it (all uses are commented out).
pthread_mutex_t amutex = PTHREAD_MUTEX_INITIALIZER;
// The thread configuration structure.
// One instance per CUDA device; passed to thread_func as its argument.
typedef struct
{
int device;          // CUDA device ordinal this worker thread drives
pthread_t thread;    // worker thread handle, joined by main()
cudaError_t status;  // result reported back to main(); NOTE(review): the
                     // commented-out code also wanted to store cufftResult
                     // values here — a distinct enum from cudaError_t
int np;              // number of real input points for the 1D FFT
}
config_t;
// The size of memory region.
// Number of real samples per transform (R2C yields np/2 + 1 complex bins).
int np = 16384;
// The function executed by each thread assigned with CUDA device.
void *thread_func(void *arg)
{
// Unpack the config structure.
config_t *config = (config_t *)arg;
int device = config->device;
int fft_in_np = config->np;
int fft_out_np = config->np / 2 + 1;
/*
int status = pthread_mutex_lock(&amutex);
if (status) {
fprintf(stderr, "Cannot lock mutex for device %d, status = %d\n",
device, status);
pthread_exit(NULL);
}
*/
// Set focus on device with the specificed index.
cudaError_t cuda_status = cudaSetDevice(device);
if (cuda_status != cudaSuccess) {
fprintf(stderr, "Cannot set focus to device %d, status = %d\n",
device, cuda_status);
config->status = cuda_status;
pthread_exit(NULL);
}
/*
status = pthread_mutex_unlock(&amutex);
if (status) {
fprintf(stderr, "Cannot unlock mutex for device %d, status = %d\n",
device, status);
pthread_exit(NULL);
}
*/
cufftHandle r2c_plan;
cufftResult cufft_status = cufftPlan1d(&r2c_plan, fft_in_np, CUFFT_R2C, 1);
if (cufft_status != CUFFT_SUCCESS) {
fprintf(stderr, "Plan creation failed for device %d, status = %d\n",
device, cufft_status);
//config->status = cufft_status;
pthread_exit(NULL);
}
cuda_status = cudaDeviceSynchronize();
if (cuda_status != cudaSuccess) {
fprintf(stderr, "Failed to synchronize device %d, status = %d\n",
device, cuda_status);
config->status = cuda_status;
pthread_exit(NULL);
}
// Create device arrays for input and output data.
cufftReal *in_dev_data = NULL;
cufftComplex *out_dev_data = NULL;
cuda_status = cudaMalloc((void **)&in_dev_data, (fft_in_np + 2) * sizeof(cufftReal));
if (cuda_status != cudaSuccess) {
fprintf(stderr, "Cannot allocate CUDA FFT buffer on device %d, status = %d\n",
device, cuda_status);
config->status = cuda_status;
pthread_exit(NULL);
}
cuda_status = cudaMalloc((void **)&out_dev_data, fft_out_np * sizeof(cufftComplex));
if (cuda_status != cudaSuccess) {
fprintf(stderr, "Cannot allocate CUDA FFT buffer on device %d, status = %d\n",
device, cuda_status);
config->status = cuda_status;
pthread_exit(NULL);
}
printf("Device %d initialized\n", device);
//out_dev_data = (cufftComplex *)in_dev_data;
cufft_status = cufftExecR2C(r2c_plan, in_dev_data, out_dev_data);
if (cufft_status != CUFFT_SUCCESS) {
fprintf(stderr, "FFT execution failed for device %d, status = %d\n",
device, cufft_status);
//config->status = cuda_status;
pthread_exit(NULL);
}
cuda_status = cudaDeviceSynchronize();
if (cuda_status != cudaSuccess) {
fprintf(stderr, "Failed to synchronize device %d, status = %d\n",
device, cuda_status);
config->status = cuda_status;
pthread_exit(NULL);
}
// Dispose device buffers.
cuda_status = cudaFree(in_dev_data);
if (cuda_status != cudaSuccess) {
fprintf(stderr, "Cannot release input buffer on device %d, status = %d\n",
device, cuda_status);
config->status = cuda_status;
pthread_exit(NULL);
}
cufft_status = cufftDestroy(r2c_plan);
if (cufft_status != CUFFT_SUCCESS) {
fprintf(stderr, "Plan destruction failed for device %d, status = %d\n",
device, cufft_status);
//config->status = cuda_status;
pthread_exit(NULL);
}
printf("Device %d deinitialized\n", device);
config->status = 0;
return NULL;
}
int main(int argc, char* argv[])
{
int ndevices = 0;
cudaError_t cuda_status = cudaGetDeviceCount(&ndevices);
if (cuda_status != cudaSuccess) {
fprintf(stderr, "Cannot get the cuda device count, status = %d\n",
cuda_status);
return cuda_status;
}
// Return if no cuda devices present.
printf("%d CUDA device(s) found\n", ndevices);
if (!ndevices)
return 0;
int dev_num;
cuda_status = cudaGetDevice(&dev_num);
if (cuda_status != cudaSuccess) {
fprintf(stderr, "Cannot get the cuda device number, status = %d\n",
cuda_status);
return cuda_status;
}
// Create workers configs. Its data will be passed as
// argument to thread_func.
config_t* configs = (config_t*)malloc(sizeof(config_t) * ndevices);
// For each CUDA device found create a separate thread
// and execute the thread_func.
for (int i = 0; i < ndevices; i++) {
config_t *config = configs + i;
config->device = i;
config->np = np;
//config->in_host = in + np * i;
int status = pthread_create(&config->thread, NULL, thread_func, config);
if (status) {
fprintf(stderr, "Cannot create thread for device %d, status = %d\n",
i, status);
return status;
}
}
// Wait for device threads completion.
// Check error status.
int status = 0;
for (int i = 0; i < ndevices; i++) {
pthread_join(configs[i].thread, NULL);
status += configs[i].status;
}
if (status)
return status;
free(configs);
return 0;
}