I have a long-standing issue in my code with the following symptoms:
- The program sometimes hangs forever at a random loop iteration, but whenever it completes successfully the results are always correct. During a hang, nvidia-smi shows a constant 99% GPU load, and the backtrace consists of CUDA library calls with clock_gettime() in the innermost frame.
- For kernels called repeatedly (with the same data and arguments), nvprof shows execution times differing by 1 - 3 (!) orders of magnitude (in runs that complete successfully).
These symptoms are observed most often on Fermi-generation Tesla GPUs; K20* devices also demonstrate the issue, but more rarely.
I initially suspected MPI+CUDA interoperability problems - https://devtalk.nvidia.com/default/topic/621170/random-execution-times-and-freezes-with-concurent-kernels/ - but it is actually a CUDA-only issue.
I have prepared a cut-down (quite ugly) version of a small piece of my code that reproduces the described issue on C2050 and X2070.
The three kernels differ only in small code segments in getnewcell_p_iw_(). The kernels just copy from fb_cu[i] to fw_cu[j], where i and j are computed individually in each thread with fairly complex arithmetic.
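In essence each kernel body boils down to the pattern below (a simplified sketch only: src and dst here are placeholders for the real per-thread checkerboard index arithmetic done in getnewcell_p_iw_() / getnewneighbour_p_iw_() in the full predictor.cu at the end of the post):
#define NCTOT 46
__global__ void copy_pattern(int N, double **fb_cu, double **fw_cu)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < N) {
        //in the real kernels src and dst come from fairly involved
        //per-thread (x, y, z) index calculations
        int src = id; //placeholder for the index computed in getnewneighbour_p_iw_()
        int dst = id; //placeholder for the index computed in getnewcell_p_iw_()
        for (int ik = 0; ik < NCTOT; ik++)
            fw_cu[ik][dst] = fb_cu[ik][src];
    }
}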
Launch:
./run iters_num dimx dimy dimz
where iters_num defines the number of iterations of the main loop and the last three arguments define the sizes of the main arrays (each must be even and >= 6).
Normally it prints some iteration counts and completes without errors.
On X2070, 340.58 driver, CUDA 5.0, 5.5 or 6.5:
./run 1000000 48 48 6
cudamalloc complete
iter=0
iter=1000
iter=2000
...
iter=106000
iter=107000
[here it hangs forever]
nvidia-smi shows at this moment:
+------------------------------------------------------+
| NVIDIA-SMI 340.58     Driver Version: 340.58         |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla X2070         Off  | 0000:03:00.0     Off |                    0 |
| 100%   N/A    P0    N/A / N/A |     67MiB /  5375MiB |     99%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla X2070         Off  | 0000:81:00.0     Off |                    0 |
| 100%   N/A    P8    N/A / N/A |     10MiB /  5375MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Compute processes:                                               GPU Memory |
|  GPU       PID  Process name                                     Usage      |
|=============================================================================|
|    0      4436  .../run                                              55MiB  |
+-----------------------------------------------------------------------------+
Backtrace from gdb:
#0 0x00002aaaaaacba11 in clock_gettime ()
#1 0x00002aaaaacd0e46 in clock_gettime () from /lib64/librt.so.1
#2 0x00002aaaac6f0d5e in ?? () from /usr/lib64/libcuda.so.1
#3 0x00002aaaac054a03 in ?? () from /usr/lib64/libcuda.so.1
#4 0x00002aaaac035173 in ?? () from /usr/lib64/libcuda.so.1
#5 0x00002aaaac0352b1 in ?? () from /usr/lib64/libcuda.so.1
#6 0x00002aaaabfaac82 in ?? () from /usr/lib64/libcuda.so.1
#7 0x00002aaaabf799b2 in cuStreamSynchronize () from /usr/lib64/libcuda.so.1
#8 0x0000000000406fc4 in cudart::cudaApiStreamSynchronize(CUstream_st*) ()
#9 0x000000000043cecd in cudaStreamSynchronize ()
#10 0x0000000000402ccd in main (argc=5, argv=0x7fffffffe598) at main.cu:102
cuda-memcheck shows no errors.
- Launching with a small number of iterations gives a better chance of completing successfully:
nvprof ./run 10 48 48 6
==3497== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 49.89%  711.21ms        10  71.121ms  161.18us  99.873ms  predictor_gpu_w_up(int, double**, double**, int, int, int)
 49.89%  711.20ms        10  71.120ms  154.49us  99.871ms  predictor_gpu_w_down(int, double**, double**, int, int, int)
  0.11%  1.5602ms        10  156.02us  138.79us  217.43us  predictor_gpu_w_left(int, double**, double**, int, int, int)
Pay attention to the min and max times for the first two kernels - they differ by a factor of more than 600!
The same behavior is visible in the Visual Profiler timeline.
- If I disable concurrent kernels (via CUDA_LAUNCH_BLOCKING), I NEVER see this issue and the kernel timings are ALWAYS "normal":
CUDA_LAUNCH_BLOCKING=1 nvprof ./run 10 48 48 6
==18305== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 27.44%  1.3834ms        10  138.34us  137.09us  139.87us  predictor_gpu_w_left(int, double**, double**, int, int, int)
 21.39%  1.0782ms        10  107.82us  106.74us  109.08us  predictor_gpu_w_down(int, double**, double**, int, int, int)
 21.37%  1.0774ms        10  107.74us  106.81us  108.66us  predictor_gpu_w_up(int, double**, double**, int, int, int)
I'm sure this is not an nvprof-related issue, since the total execution time also differs significantly between concurrent and serialized kernel execution.
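For completeness, concurrency can also be removed in code instead of through the environment variable: launching all three kernels on the same stream serializes them with respect to each other (note this is weaker than CUDA_LAUNCH_BLOCKING, which makes every launch synchronous). A minimal sketch of what the main-loop body looks like in that experiment (not part of the reproducer below):
//single-stream variant of the loop body: all three kernels go to stream[0],
//so they cannot run concurrently with each other
elems = dimx*dimy;
predictor_gpu_w_down<<<elems/64 + 1, 64, 0, stream[0]>>>( (elems/64)*64,
        cu_fb_ind, cu_fw_ind, dimx + 2, dimy + 2, dimz + 2);
predictor_gpu_w_up<<<elems/64 + 1, 64, 0, stream[0]>>>( (elems/64)*64,
        cu_fb_ind, cu_fw_ind, dimx + 2, dimy + 2, dimz + 2);
elems = (dimy - 4)*(dimz - 4);
predictor_gpu_w_left<<<elems/64 + 1, 64, 0, stream[0]>>>( (elems/64)*64,
        cu_fb_ind, cu_fw_ind, dimx + 2, dimy + 2, dimz + 2);
gpuErrchk(cudaStreamSynchronize(stream[0]));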
The same behavior occurs on C2050 (CUDA 5.0, 5.5, 6.5; 340.29 driver; SUSE 11 SP11; 2.6.32 kernel).
Unfortunately I can't reproduce the issue with this code on GTS 450 (331.20 driver, CUDA 5.5, Ubuntu 12.04, 3.11 kernel), nor on cc_30 (Quadro 410) and cc_35 (K20C) devices (352.**, 340.58 drivers; CUDA 6.5, 7.0; Ubuntu 10.04, 2.6.32 kernel).
So please, could anyone test this code on Fermi Teslas with newer drivers (35*.**) and toolkits (7.0, 7.5) and report the results? (I have no admin privileges to update the drivers on the servers with the X2075 and C2050 cards.)
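If it helps when reporting, here is a small stand-alone snippet (my addition, not part of the reproducer) that prints the driver version, runtime version and compute capability of each device via the standard CUDA runtime API:
//print_env.cu - build with: nvcc -o print_env print_env.cu
#include <cuda_runtime.h>
#include <stdio.h>
int main(void){
    int driverVer = 0, runtimeVer = 0, devCount = 0;
    cudaDriverGetVersion(&driverVer);
    cudaRuntimeGetVersion(&runtimeVer);
    cudaGetDeviceCount(&devCount);
    printf("driver API version %d, runtime version %d, %d device(s)\n",
           driverVer, runtimeVer, devCount);
    for (int d = 0; d < devCount; d++){
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, d);
        printf("  device %d: %s, compute capability %d.%d\n",
               d, prop.name, prop.major, prop.minor);
    }
    return 0;
}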
Source code files:
makefile:
CC =gcc -O0 -g3
NVCC =nvcc
NVFLAGS =-arch=sm_20 -g
run: main.o \
     predictor_gpu_new_down.o predictor_gpu_new_up.o predictor_gpu_new_left.o
	${NVCC} ${NVFLAGS} -o $@ $^
main.o: main.cu
	${NVCC} ${NVFLAGS} -c $^
predictor_gpu_new_down.o: predictor.cu
	${NVCC} ${NVFLAGS} -DPD -c $^ -o $@
predictor_gpu_new_up.o: predictor.cu
	${NVCC} ${NVFLAGS} -DPU -c $^ -o $@
predictor_gpu_new_left.o: predictor.cu
	${NVCC} ${NVFLAGS} -DPL -c $^ -o $@
cu.header.h:
//size for 1st dimension of 2D arrays
#define NCTOT 46
main.cu:
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h> //for memset()
#include "cu.header.h"
__global__ void predictor_gpu_w_left(int N,\
double **fb_cuu, double **fw_cuu, \
int domx_cuu, int domy_cuu, int domz_cuu);
__global__ void predictor_gpu_w_up(int N,\
double **fb_cuu, double **fw_cuu, \
int domx_cuu, int domy_cuu, int domz_cuu);
__global__ void predictor_gpu_w_down(int N,\
double **fb_cuu, double **fw_cuu, \
int domx_cuu, int domy_cuu, int domz_cuu);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stdout,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main(int argc, char *argv[]){
double* fb[NCTOT]; //black cells
double* fw[NCTOT]; //white cells
int dimx=atoi(argv[2]);//50;
int dimy=atoi(argv[3]);//54;
int dimz=atoi(argv[4]);//16;
int i;
int dim_size=(((dimz+2) * (dimy+2) * (dimx+2) /2) ); //stride (in elements) between f*[i] and f*[i+1]
int elems; //number of elements to process in the kernels
double* to_gpu_const_arr[NCTOT];
double* cu_fw; //white cells in GPU
double* cu_fb; //black cells in GPU
double **cu_fw_ind,**cu_fb_ind; //for 2D indexing: cu_f*_ind[][]
int timesteps = atoi(argv[1]);//num steps in main loop
cudaStream_t stream[3];
//allocate arrays in GPU for 2D indexing
gpuErrchk(cudaMalloc((void**)&cu_fb, dim_size * (sizeof(double) * (NCTOT))));
gpuErrchk(cudaMalloc((void**)&cu_fw, dim_size * (sizeof(double) * (NCTOT))));
gpuErrchk(cudaMalloc((void***)&cu_fw_ind,sizeof(double*) * (NCTOT)));
gpuErrchk(cudaMalloc((void***)&cu_fb_ind,sizeof(double*) * (NCTOT)));
//init first dimension indexing (cu_f*_ind[...][])***************
for (i = 0; i < (NCTOT); i++)
to_gpu_const_arr[i] = &cu_fb[(dim_size * i)];
gpuErrchk(cudaMemcpy(cu_fb_ind, to_gpu_const_arr, (NCTOT) * sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaDeviceSynchronize());
for (i = 0; i < (NCTOT); i++)
to_gpu_const_arr[i] = &cu_fw[(dim_size* i)];
gpuErrchk(cudaMemcpy(cu_fw_ind, to_gpu_const_arr, (NCTOT) * sizeof(double *), cudaMemcpyHostToDevice));
gpuErrchk(cudaDeviceSynchronize());
//***************************************************************
printf(" cudamalloc complete\n");
for (i=0; i<3; i++)
gpuErrchk(cudaStreamCreate(&(stream[i])));
//init CPU arrays to zero **************************************
fw[0] = (double *)malloc(dim_size * (sizeof(double) ) * (NCTOT));
fb[0] = (double *)malloc(dim_size * (sizeof(double) ) * (NCTOT));
memset(fw[0], 0, dim_size * (sizeof(double) ) * (NCTOT));
memset(fb[0], 0, dim_size * (sizeof(double) ) * (NCTOT));
//***************************************************************
//copy arrays CPU->GPU
gpuErrchk(cudaMemcpy(cu_fb,fb[0],(dim_size)* (sizeof(double) * (NCTOT)), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(cu_fw,fw[0],(dim_size)* (sizeof(double) * (NCTOT)),cudaMemcpyHostToDevice));
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaPeekAtLastError());
//main loop
for (i=0; i< timesteps; i++){
if (i % 1000 == 0) printf("iter=%d\n", i);
fflush(stdout);
elems = dimx*dimy;
predictor_gpu_w_down<<<elems/64 + 1, 64, 0, stream[1]>>>( (elems/64)*64, \
cu_fb_ind, cu_fw_ind, dimx + 2, dimy + 2, dimz + 2);
gpuErrchk(cudaPeekAtLastError());
predictor_gpu_w_up<<<elems/64 + 1, 64, 0, stream[2]>>>( (elems/64)*64, \
cu_fb_ind, cu_fw_ind, dimx + 2, dimy + 2, dimz + 2);
gpuErrchk(cudaPeekAtLastError());
elems = (dimy - 4)*(dimz - 4);
predictor_gpu_w_left<<<elems/64 + 1, 64, 0, stream[0]>>>( (elems/64)*64, \
cu_fb_ind, cu_fw_ind, dimx + 2, dimy + 2, dimz + 2);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaStreamSynchronize(stream[0]));
gpuErrchk(cudaStreamSynchronize(stream[1]));
gpuErrchk(cudaStreamSynchronize(stream[2]));
} //end of main loop
//"dirty" exit
printf("completed\n");
exit(0);
}
predictor.cu:
#include "cu.header.h"
//parameter for index calculations
#define layers 2
//the following ifdefs allow compiling multiple kernels from the same .cu file (avoiding naming conflicts)
#ifdef PD
#define predictor_gpu_w(...) predictor_gpu_w_down(__VA_ARGS__)
#define predict predict_d
#endif
#ifdef PU
#define predictor_gpu_w(...) predictor_gpu_w_up(__VA_ARGS__)
#define predict predict_u
#endif
#ifdef PL
#define predictor_gpu_w(...) predictor_gpu_w_left(__VA_ARGS__)
#define predict predict_l
#endif
class predict {
public:
//pointers to 2D arrays
double **fw_cu, **fb_cu;
//"sizes" for the 2nd dimension of the 2D arrays
int domx_cu, domy_cu, domz_cu;
__device__ void predictor_gpu_w_(int N);
__device__ __noinline__ int getnewneighbour_p_iw_(int x, int y, int z, double *fn);
__device__ int getnewcell_p_iw_(int *id, int *x, int *y,int *z, double *fc);
//local arrays
double fc[NCTOT];
double fn[NCTOT];
};
//awkward initialization
__global__ void predictor_gpu_w(int N,\
double **fb_cuu, double **fw_cuu, \
int domx_cuu, int domy_cuu, int domz_cuu){
predict obj;
obj.fb_cu = fb_cuu;
obj.fw_cu = fw_cuu;
obj.domx_cu = domx_cuu;
obj.domy_cu = domy_cuu;
obj.domz_cu = domz_cuu;
obj.predictor_gpu_w_( N);
}
__device__ void predict::predictor_gpu_w_(int N) {
int id, x,y,z;
int ik;
id =blockIdx.x * blockDim.x + threadIdx.x;//thread Id
if (id < N) {
//fill fc[] from fw_cu
getnewcell_p_iw_(&id, &x, &y, &z, fc);
//fill fn[] from fb_cu
getnewneighbour_p_iw_(x, y, z, fn);
for (ik=0;ik<NCTOT; ik++) fw_cu[ik][id] = fn[ik];
}
return;
}
//some index calculations
__device__ int predict::getnewcell_p_iw_(int *id, int *x, int *y,int *z, double *fc) {
int i;
//conditional compilation via the -DPD / -DPU / -DPL flags, see makefile
#ifdef PD
*z = *id / ((domy_cu - 2) * (domx_cu - 2) / 2);
*y = (*id - *z * (domy_cu - 2) * (domx_cu - 2) / 2) / ((domx_cu - 2)/2);
*x = *id % ((domx_cu - 2)/2);
*x = 2 * (*x) + !((*y + *z) & 0x1);
#elif PU
*z = *id / ((domy_cu - 2) * (domx_cu - 2) / 2);
*y = (*id - *z * (domy_cu - 2) * (domx_cu - 2) / 2) / ((domx_cu - 2)/2);
*x = *id % ((domx_cu - 2)/2);
*x = 2 * (*x) + !((*y + *z) & 0x1);
*z = *z + domz_cu - 4;
#elif PL
*z = *id / ((domy_cu - 6) * (layers) / 2);
*y = (*id - *z * (domy_cu - 6) * (layers) / 2) / ((layers)/2);
*x = *id % ((layers)/2);
*x = 2 * (*x) + !((*y + *z) & 0x1);
*y = *y + 2;
*z = *z + 2;
#endif
*id = (*z + 1) * (domy_cu) * ((domx_cu)/2) + (*y + 1) * ((domx_cu)/2) + (*x + 1)/2;
for (i = 0; i < NCTOT; i++) fc[i] = fw_cu[i][*id];
return 0;
}
//some index calculations
__device__ __noinline__ int predict::getnewneighbour_p_iw_(int x, int y, int z, double *fn) {
int i;
for (i = 0; i < NCTOT; i++)
fn[i] = fb_cu[i][((z + 1) * (domy_cu) * ((domx_cu)/2) + (y + 1) * ((domx_cu)/2) + (x + 1)/2)];
return 0;
}