cuda the launch timed out and was terminated

Hi,

I have a 512x512x108 matrix and I need to perform some operations on its data. When I run the kernel with one particular line enabled, I get this message:

cuda the launch timed out and was terminated.

If I remove this line:

dadosOut[indice] = 500;

the kernel executes without problems. What is the problem?

yInc = 512;

zInc = 512*512;

dadosIn[512*512*108]

dadosOut[512*512*108]

tamanho = 2

// Flattens the 3-D coordinate (x, y, z) into a linear offset using the row
// stride yInc and the slice stride zInc. Offsets that are not strictly
// positive, or that reach 108 slices (108 * zInc) or beyond, are replaced
// by 0 so the caller always gets an offset below 108 * zInc.
__device__ int calcIndice(int x, int y, int z, int yInc, int zInc){
	const int deslocamento = (x + (y * yInc)) + (z * zInc);
	const bool dentroDoVolume = (deslocamento > 0) && (deslocamento < 108 * zInc);
	return dentroDoVolume ? deslocamento : 0;
}

// Kernel: for every voxel inside the guarded interior range it
//   1. writes a 3x3x3 weighted average of dadosIn (weights 1/2/4/8, divided by 64) to dadosOut[indice],
//   2. computes three gradient components and a 3x3 Hessian from dadosOut neighbours,
//   3. runs an iterative QR-style factorisation of the Hessian (at most 50 iterations),
//   4. sorts the resulting diagonal values ascending (with their vector rows), and
//   5. overwrites dadosOut[indice] with 500 when the centerline test passes.
//
// Parameters:
//   dadosIn  - input volume, read through calcIndice offsets.
//   dadosOut - output volume; receives the blurred value and, for centerline voxels, 500.
//   vZ       - element 0 is set to 3 by every thread that passes the guard.
//   tamanho  - multiplier applied to the block indices when forming x, y, z
//              (presumably the block edge length; TODO confirm against the launch code).
//   yInc     - row stride used for linear offsets.
//   zInc     - slice stride used for linear offsets.
//
// Indexing: x comes from blockIdx.x; blockIdx.y carries both y (blockIdx.y mod 256)
// and z (blockIdx.y / 256).
//
// NOTE(review): steps 2-3 read dadosOut at neighbouring voxels, which are written by
// other threads of this same launch (step 1 and step 5). Nothing visible here orders
// those writes before the reads, so the result looks dependent on thread scheduling - confirm.
__global__ void calc(short *dadosIn, float *dadosOut, int *vZ, int tamanho, int yInc, int zInc){

	int x = blockIdx.x * tamanho + threadIdx.x;

	// blockIdx.y - 256*(blockIdx.y/256) is blockIdx.y modulo 256
	int y = ( blockIdx.y - (256*(blockIdx.y/256)) ) * tamanho + threadIdx.y;

	int z = (blockIdx.y/256) * tamanho + threadIdx.z;

	int indice = (x + (y*yInc))+(z*zInc);

	int gradX, gradY, gradZ;

	float mHessiana[3][3], mA[3][3], mQ[3][3], mR[3][3], mS[3][3], mStemp[3][3]; // Hessian, A, Q, R and S matrices of the QR method

	float eValues[3], eVectors[3][3]; // eigenvalues and eigenvectors

	float valor = 0;

        int cont = 0;

	// NOTE(review): isCenterline is never read in this kernel
	int isCenterline = 0;	

	float prodV1 = 0;

	float prodV2 = 0;

	

	// Only voxels strictly inside this range are processed; all other threads do nothing.
	if(x>5 && y>5 && z>5 && x<507 && y<507 && z <103){						

		//-----------------------------------------------------------------------

		// Apply the Gaussian blur filter.
		// The sum is formed from short/int operands and divided by the integer 64,
		// so the division truncates before the value is stored as float.

		dadosOut[indice] = (dadosIn[calcIndice(x-1, y+1, z-1, yInc, zInc)] +

							2 *dadosIn[calcIndice(x, y+1, z-1, yInc, zInc)] +

							dadosIn[calcIndice(x+1, y+1, z-1, yInc, zInc)] +

							2 * dadosIn[calcIndice(x-1, y, z-1, yInc, zInc)] +

							4 * dadosIn[calcIndice(x, y, z-1, yInc, zInc)] +

							2 * dadosIn[calcIndice(x+1, y, z-1, yInc, zInc)] +

							dadosIn[calcIndice(x-1, y-1, z-1, yInc, zInc)] +

							2 *dadosIn[calcIndice(x, y-1, z-1, yInc, zInc)] +

							dadosIn[calcIndice(x+1, y-1, z-1, yInc, zInc)] +

								

							2 * dadosIn[calcIndice(x-1, y+1, z, yInc, zInc)] +

							4 * dadosIn[calcIndice(x, y+1, z, yInc, zInc)] +

							2 * dadosIn[calcIndice(x+1, y+1, z, yInc, zInc)] +

							4 * dadosIn[calcIndice(x-1, y, z, yInc, zInc)] +

							8 * dadosIn[calcIndice(x, y, z, yInc, zInc)] +

							4 * dadosIn[calcIndice(x+1, y, z, yInc, zInc)] +

							2 * dadosIn[calcIndice(x-1, y-1, z, yInc, zInc)] +

							4 * dadosIn[calcIndice(x, y-1, z, yInc, zInc)] +

							2 * dadosIn[calcIndice(x+1, y-1, z, yInc, zInc)] +							

							dadosIn[calcIndice(x-1, y+1, z+1, yInc, zInc)] +

							2 *dadosIn[calcIndice(x, y+1, z+1, yInc, zInc)] +

							dadosIn[calcIndice(x+1, y+1, z+1, yInc, zInc)] +

							2 * dadosIn[calcIndice(x-1, y, z+1, yInc, zInc)] +

							4 * dadosIn[calcIndice(x, y, z+1, yInc, zInc)] +

							2 * dadosIn[calcIndice(x+1, y, z+1, yInc, zInc)] +

							dadosIn[calcIndice(x-1, y-1, z+1, yInc, zInc)] +

							2 *dadosIn[calcIndice(x, y-1, z+1, yInc, zInc)] +

							dadosIn[calcIndice(x+1, y-1, z+1, yInc, zInc)])/64;		

		//------------------------------------------------------------------

		// Compute the gradients.
		// Each one is a 1-2-1 weighted difference of dadosOut neighbours; the float
		// result is converted to int on assignment.

		gradX = (dadosOut[calcIndice(x-1, y-1, z, yInc, zInc)] +

				2 * dadosOut[calcIndice(x, y-1, z, yInc, zInc)] +

				dadosOut[calcIndice(x+1, y-1, z, yInc, zInc)])

				-

				(dadosOut[calcIndice(x-1, y+1, z, yInc, zInc)] +

				2 * dadosOut[calcIndice(x, y+1, z, yInc, zInc)] +

				dadosOut[calcIndice(x+1, y+1, z, yInc, zInc)]);	

		gradY = (dadosOut[calcIndice(x+1, y+1, z, yInc, zInc)] +

				2 * dadosOut[calcIndice(x+1, y, z, yInc, zInc)] +

				dadosOut[calcIndice(x+1, y-1, z, yInc, zInc)])

				-

				(dadosOut[calcIndice(x-1, y+1, z, yInc, zInc)] +

				2 * dadosOut[calcIndice(x-1, y, z, yInc, zInc)] +

				dadosOut[calcIndice(x-1, y-1, z, yInc, zInc)]);	

		gradZ = (dadosOut[calcIndice(x, y-1, z-1, yInc, zInc)] +

				2 * dadosOut[calcIndice(x, y-1, z, yInc, zInc)] +

				dadosOut[calcIndice(x, y-1, z+1, yInc, zInc)])

				-

				(dadosOut[calcIndice(x, y+1, z-1, yInc, zInc)] +

				2 * dadosOut[calcIndice(x, y+1, z, yInc, zInc)] +

				dadosOut[calcIndice(x, y+1, z+1, yInc, zInc)]);	

		// Every thread that passes the guard stores the same constant here.
		vZ[0] = 3;

		//------------------------------------------------------------------------

		// Compute the Hessian matrix

		// Hessian matrix layout

		//   0   1   2

		//0 Dxx Dxy Dxz

		//1 Dyx Dyy Dyz

		//2 Dzx Dzy Dzz

		//Dxx

		mHessiana[0][0] = dadosOut[calcIndice(x+1,y,z, yInc, zInc)] - 2 * dadosOut[calcIndice(x,y,z, yInc, zInc)] +

			dadosOut[calcIndice(x-1,y,z, yInc, zInc)];

		//Dyy

		mHessiana[1][1] = dadosOut[calcIndice(x,y+1,z, yInc, zInc)] - 2 * dadosOut[calcIndice(x,y,z, yInc, zInc)] +

			dadosOut[calcIndice(x,y-1,z, yInc, zInc)];

			

		//Dzz

		mHessiana[2][2] = dadosOut[calcIndice(x,y,z+1, yInc, zInc)] - 2 * dadosOut[calcIndice(x,y,z, yInc, zInc)] +

			dadosOut[calcIndice(x,y,z-1, yInc, zInc)];			

		// Dxy and Dyx

		mHessiana[0][1] = mHessiana[1][0] = (dadosOut[calcIndice(x-1,y+1,z, yInc, zInc)] - dadosOut[calcIndice(x+1,y+1,z, yInc, zInc)] +

			dadosOut[calcIndice(x+1,y-1,z, yInc, zInc)] - dadosOut[calcIndice(x-1,y-1,z, yInc, zInc)])/4;

			

		// Dxz and Dzx

		mHessiana[0][2] = mHessiana[2][0] = (dadosOut[calcIndice(x-1,y,z+1, yInc, zInc)] - dadosOut[calcIndice(x+1,y,z+1, yInc, zInc)] +

			dadosOut[calcIndice(x+1,y,z-1, yInc, zInc)] - dadosOut[calcIndice(x-1,y,z-1, yInc, zInc)])/4;

			

		// Dyz and Dzy

		mHessiana[1][2] = mHessiana[2][1] = (dadosOut[calcIndice(x,y-1,z+1, yInc, zInc)] - dadosOut[calcIndice(x,y+1,z+1, yInc, zInc)] +

			dadosOut[calcIndice(x,y+1,z-1, yInc, zInc)] - dadosOut[calcIndice(x,y-1,z-1, yInc, zInc)])/4;

		//----------------------------------------------------------------------------

		// Compute the eigenvalues/eigenvectors

		// Copy the Hessian into matrix A

		for (int j = 0; j <= 2; j++){

			for(int i = 0; i <=2; i++){

				mA[i][j] = mHessiana[i][j];

			}

		}

		// Initialise matrices R, S and Q to zero

		for (int i = 0; i <= 2; i++){

			for(int j = 0; j <=2; j++){

				mR[i][j] = mS[i][j] = mQ[i][j] = 0;

			}

		}

		// Iterate while none of Q's diagonal entries has magnitude exactly 1, up to 50 times.
		// NOTE(review): the && chain stops as soon as ONE diagonal entry reaches +-1, and the
		// comparison is exact float equality - confirm this is the intended stopping rule.
		while(fabs(mQ[0][0]) != 1 && fabs(mQ[1][1]) != 1 && fabs(mQ[2][2]) != 1 && cont < 50){

			cont++;

			// Compute the Q and R matrices (row-wise orthonormalisation of mA, done in place)

			for(int j = 0; j <= 2; j++){

				valor = 0;

				// squared length of row j

				for(int t = 0; t <= 2; t++){

					valor += mA[j][t] * mA[j][t];

				}

				mR[j][j] = sqrt(valor);

				// a zero-length row abandons the remaining rows of this pass

				if(mR[j][j] == 0){

					break;

				} else {

					// normalise row j

					for(int i = 0; i <= 2; i++){

						mA[j][i] = mA[j][i] / mR[j][j];

					}

				}

				// remove the row-j component from every later row k, recording the coefficient in mR[k][j]

				for(int k = j+1; k <= 2; k++){

					valor = 0;

					for(int u = 0; u <= 2; u++){

						valor += mA[j][u] * mA[k][u];

					}

					mR[k][j] = valor;

					for(int p = 0; p <=2; p++){

						mA[k][p] = mA[k][p] - (mA[j][p] * mR[k][j]);

					}

				}

			}

			// Copy the result into the actual Q matrix (and seed S with it on the first iteration)

			for (int i = 0; i <= 2; i++){

				for(int j = 0; j <=2; j++){

					mQ[i][j] = mA[i][j];

					if(cont == 1){

						mS[i][j] = mA[i][j];

					}

				}

			}

			// Compute matrix S when this is not the first iteration: mStemp = mQ * mS

			if(cont > 1){

				for (int i = 0; i <= 2; i++){

					for(int j = 0; j <=2; j++){

						mStemp[i][j] = mS[0][j] * mQ[i][0] + mS[1][j] * mQ[i][1] + mS[2][j] * mQ[i][2];

					}

				}

			}

			// Copy the temporary product into the actual S

			if(cont > 1){

				for (int i = 0; i <= 2; i++){

					for(int j = 0; j <=2; j++){

						mS[i][j] = mStemp[i][j];

					}

				}

			}

			// New matrix A for the next iteration: mA = mQ * mR

			for (int i = 0; i <= 2; i++){

				for(int j = 0; j <=2; j++){

					mA[i][j] = mR[0][j] * mQ[i][0] + mR[1][j] * mQ[i][1] + mR[2][j] * mQ[i][2];

				}

			}

		}

		//------------------------------------------------------------------------

		// Check which points are centerline points

		// Copy the values used by the check: diagonal of mA as eigenvalues, rows of mS as eigenvectors

        for(int c = 0; c <= 2; c++){

            eValues[c] = mA[c][c];

            for(int d = 0; d <= 2; d++){

                eVectors[c][d] = mS[c][d];

            }

        }

		float temp = 0;

// Sort the eigenvalues ascending (swapping the matching eigenvector rows) for the point evaluation

        for(int i = 0; i <= 2; i++){

            for(int j = i; j <= 2; j++){

                if(eValues[i] > eValues[j]){

                    temp = eValues[i];

                    eValues[i] = eValues[j];

                    eValues[j] = temp;

for(int d = 0; d <= 2; d++){

                        temp = eVectors[i][d];

                        eVectors[i][d] = eVectors[j][d];

                        eVectors[j][d] = temp;

                    }

                }

            }

        }

		// Check whether the point is a centerline point: the two smallest eigenvalues must be
		// non-positive and both produtoVetores results must be exactly zero.

		if(eValues[0] <= 0 && eValues[1] <= 0){						

            prodV1 = produtoVetores(eVectors[0][0], eVectors[0][1], eVectors[0][2], gradX, gradY, gradZ);

            prodV2 = produtoVetores(eVectors[1][0], eVectors[1][1], eVectors[1][2], gradX, gradY, gradZ);

if(prodV1 == 0 && prodV2 == 0){

                //if(eValues[1]/eValues[0] >= 0.5){					

		    dadosOut[indice] = 500;  // the statement the author reports as triggering the launch timeout

                //}

			}

        }

	}

	

}

Launch timeouts normally occur because the kernel is taking too long to run on a GPU which has an active display. The driver will kill kernels that take more than a few seconds to complete. The reason commenting out that line allows the kernel to complete without the timeout is that, without the global memory writes, most of the kernel code is removed by compiler optimisation, leaving you with an empty kernel.

The solution is to reduce the kernel execution time, either by doing less work per kernel call or by improving the code efficiency, or some combination of both. The other alternative is to use a dedicated compute card, which eliminates the display driver time limit altogether.

Try shutting down the X Window System before running your program, so you can see in practice what avidday explained.

Cheers.
John.

I am using Windows with a single video card, so I don't know how to shut down X or Explorer just to run the application. I will try splitting the kernel so that it executes without problems, and I will post the result.

Hi,

I found that one problem is that, when I use the two matrices dadosIn and dadosOut, the kernel does not get past the Gaussian lines.

So I removed the dadosOut matrix and used a local matrix dadosOut[3][3][3] instead, but when I execute it the same problem happens on the same line:

cuda the launch timed out and was terminated.

Why does one simple assignment take so much time that the kernel cannot finish? What do you suggest to solve this problem? Must I split the matrix? I cannot split the kernel, because I will need to pass many matrices as parameters, some larger than 512x512x108.

The code with the changes I made is below:

// Dot product of the 3-component vectors (v1x, v1y, v1z) and (v2x, v2y, v2z).
__device__ float produtoVetores(float v1x, float v1y, float v1z, float v2x, float v2y, float v2z){
    float acumulado = v1x * v2x;   // x contribution
    acumulado = acumulado + v1y * v2y;   // plus y contribution
    acumulado = acumulado + v1z * v2z;   // plus z contribution
    return acumulado;
}

// Converts the 3-D coordinate (x, y, z) into a linear offset, using yInc as
// the row stride and zInc as the slice stride. No range check is performed.
__device__ int calcIndice(int x, int y, int z, int yInc, int zInc){
	const int deslocLinha = y * yInc;   // offset contributed by the row
	const int deslocFatia = z * zInc;   // offset contributed by the slice
	return (x + deslocLinha) + deslocFatia;
}

// Centerline-detection kernel (single-buffer variant).
//
// For every voxel strictly inside the guarded interior range the thread
//   1. blurs the 3x3x3 neighbourhood of its voxel from dadosIn into a
//      thread-local array (3x3x3 stencil, weights 1/2/4/8, integer division by 64),
//   2. derives three gradient components and a 3x3 Hessian from that local array,
//   3. runs an iterative QR-style factorisation of the Hessian (at most 20 iterations),
//   4. sorts the resulting diagonal values ascending together with their vector rows,
//   5. writes 500 to dadosIn[indice] when the centerline test passes.
//
// Parameters:
//   dadosIn - volume that is both read (blur stencil) and written (marker 500).
//   vZ      - not used by this version; kept so existing launches still compile.
//   tamanho - multiplier applied to the block indices when forming x, y, z
//             (presumably the block edge length; TODO confirm against the launch code).
//   yInc    - row stride used for linear offsets.
//   zInc    - slice stride used for linear offsets.
//
// Indexing: x comes from blockIdx.x; blockIdx.y carries both y (blockIdx.y mod 256)
// and z (blockIdx.y / 256). No shared memory and no block-level barrier are used.
//
// Changes relative to the previous revision:
//   * the kernel's closing brace was missing and has been restored;
//   * __syncthreads() was removed: it sat inside the interior guard, which not
//     every thread of a block passes, and there is no shared memory to synchronise;
//   * the unused local isCenterline was removed;
//   * the 27-term blur expression is written as a weight-table loop (same integer sum).
//
// NOTE(review): step 5 writes dadosIn while other threads of the same launch may still
// be reading that voxel in their blur stencils, so results can depend on scheduling.
// Fixing this needs a separate output buffer, i.e. a signature change.
__global__ void calc(short *dadosIn, int *vZ, int tamanho, int yInc, int zInc){

	(void)vZ; // unused in this version

	int x = blockIdx.x * tamanho + threadIdx.x;
	// blockIdx.y - 256*(blockIdx.y/256) is blockIdx.y modulo 256
	int y = ( blockIdx.y - (256*(blockIdx.y/256)) ) * tamanho + threadIdx.y;
	int z = (blockIdx.y/256) * tamanho + threadIdx.z;
	int indice = (x + (y*yInc))+(z*zInc);

	// Only interior voxels are processed. Returning early is safe because the
	// kernel no longer contains any barrier.
	if(!(x>5 && y>5 && z>5 && x<507 && y<507 && z<103)){
		return;
	}

	int gradX, gradY, gradZ;
	float mHessiana[3][3], mA[3][3], mQ[3][3], mR[3][3], mS[3][3], mStemp[3][3]; // Hessian, A, Q, R and S matrices of the QR method
	float eValues[3], eVectors[3][3]; // eigenvalues and eigenvectors
	float valor = 0;
	int cont = 0;
	float prodV1 = 0;
	float prodV2 = 0;
	float dadosOut[3][3][3]; // blurred 3x3x3 neighbourhood, indexed [x offset + 1][y offset + 1][z offset + 1]

	//-----------------------------------------------------------------------
	// Gaussian blur of the 3x3x3 neighbourhood.
	// The stencil weight of a sample is the product of the per-axis weights
	// {1, 2, 1}, which reproduces the 1/2/4/8 pattern; the weights sum to 64.
	const int peso[3] = {1, 2, 1};

	for(int i = 0; i <= 2; i++){
		for(int j = 0; j <= 2; j++){
			for(int k = 0; k <= 2; k++){
				// centre of this stencil
				const int cx = x + i - 1;
				const int cy = y + j - 1;
				const int cz = z + k - 1;

				int soma = 0;
				for(int a = 0; a <= 2; a++){
					for(int b = 0; b <= 2; b++){
						for(int c = 0; c <= 2; c++){
							soma += peso[a] * peso[b] * peso[c] *
								dadosIn[calcIndice(cx + a - 1, cy + b - 1, cz + c - 1, yInc, zInc)];
						}
					}
				}

				// integer division, as in the original expression, then stored as float
				dadosOut[i][j][k] = (float)(soma / 64);
			}
		}
	}

	//------------------------------------------------------------------
	// Gradients: 1-2-1 weighted differences of the blurred neighbourhood,
	// converted to int on assignment.
	gradX = (int)((dadosOut[0][0][1] +
			2 * dadosOut[1][0][1] +
			dadosOut[2][0][1])
			-
			(dadosOut[0][2][1] +
			2 * dadosOut[1][2][1] +
			dadosOut[2][2][1]));

	gradY = (int)((dadosOut[2][2][1] +
			2 * dadosOut[2][1][1] +
			dadosOut[2][0][1])
			-
			(dadosOut[0][2][1] +
			2 * dadosOut[0][1][1] +
			dadosOut[0][0][1]));

	gradZ = (int)((dadosOut[1][0][0] +
			2 * dadosOut[1][0][1] +
			dadosOut[1][0][2])
			-
			(dadosOut[1][2][0] +
			2 * dadosOut[1][2][1] +
			dadosOut[1][2][2]));

	//------------------------------------------------------------------------
	// Hessian matrix
	//   0   1   2
	//0 Dxx Dxy Dxz
	//1 Dyx Dyy Dyz
	//2 Dzx Dzy Dzz

	//Dxx
	mHessiana[0][0] = dadosOut[2][1][1] - 2 * dadosOut[1][1][1] +
		dadosOut[0][1][1];

	//Dyy
	mHessiana[1][1] = dadosOut[1][2][1] - 2 * dadosOut[1][1][1] +
		dadosOut[1][0][1];

	//Dzz
	mHessiana[2][2] = dadosOut[1][1][2] - 2 * dadosOut[1][1][1] +
		dadosOut[1][1][0];

	// Dxy and Dyx
	mHessiana[0][1] = mHessiana[1][0] = (dadosOut[0][2][1] - dadosOut[2][2][1] +
		dadosOut[2][0][1] - dadosOut[0][0][1])/4;

	// Dxz and Dzx
	mHessiana[0][2] = mHessiana[2][0] = (dadosOut[0][1][2] - dadosOut[2][1][2] +
		dadosOut[2][1][0] - dadosOut[0][1][0])/4;

	// Dyz and Dzy
	mHessiana[1][2] = mHessiana[2][1] = (dadosOut[1][0][2] - dadosOut[1][2][2] +
		dadosOut[1][2][0] - dadosOut[1][0][0])/4;

	//----------------------------------------------------------------------------
	// Eigen computation

	// Copy the Hessian into matrix A
	for (int j = 0; j <= 2; j++){
		for(int i = 0; i <= 2; i++){
			mA[i][j] = mHessiana[i][j];
		}
	}

	// Initialise R, S and Q to zero
	for (int i = 0; i <= 2; i++){
		for(int j = 0; j <= 2; j++){
			mR[i][j] = mS[i][j] = mQ[i][j] = 0;
		}
	}

	// Iterate while none of Q's diagonal entries has magnitude exactly 1, up to 20 times.
	// NOTE(review): the && chain stops as soon as ONE diagonal entry reaches +-1 and relies
	// on exact float equality - confirm this is the intended stopping rule.
	while(fabsf(mQ[0][0]) != 1 && fabsf(mQ[1][1]) != 1 && fabsf(mQ[2][2]) != 1 && cont < 20){

		cont++;

		// Q and R: row-wise orthonormalisation of mA, done in place
		for(int j = 0; j <= 2; j++){

			// squared length of row j
			valor = 0;
			for(int t = 0; t <= 2; t++){
				valor += mA[j][t] * mA[j][t];
			}

			mR[j][j] = sqrtf(valor);

			if(mR[j][j] == 0){
				// a zero-length row abandons the remaining rows of this pass
				break;
			} else {
				// normalise row j
				for(int i = 0; i <= 2; i++){
					mA[j][i] = mA[j][i] / mR[j][j];
				}
			}

			// remove the row-j component from every later row k,
			// recording the coefficient in mR[k][j]
			for(int k = j+1; k <= 2; k++){
				valor = 0;
				for(int u = 0; u <= 2; u++){
					valor += mA[j][u] * mA[k][u];
				}

				mR[k][j] = valor;

				for(int p = 0; p <= 2; p++){
					mA[k][p] = mA[k][p] - (mA[j][p] * mR[k][j]);
				}
			}
		}

		// Copy into the actual Q matrix (and seed S with it on the first iteration)
		for (int i = 0; i <= 2; i++){
			for(int j = 0; j <= 2; j++){
				mQ[i][j] = mA[i][j];
				if(cont == 1){
					mS[i][j] = mA[i][j];
				}
			}
		}

		// From the second iteration on, accumulate S = Q * S through a temporary
		if(cont > 1){
			for (int i = 0; i <= 2; i++){
				for(int j = 0; j <= 2; j++){
					mStemp[i][j] = mS[0][j] * mQ[i][0] + mS[1][j] * mQ[i][1] + mS[2][j] * mQ[i][2];
				}
			}

			for (int i = 0; i <= 2; i++){
				for(int j = 0; j <= 2; j++){
					mS[i][j] = mStemp[i][j];
				}
			}
		}

		// New matrix A for the next iteration: A = Q * R
		for (int i = 0; i <= 2; i++){
			for(int j = 0; j <= 2; j++){
				mA[i][j] = mR[0][j] * mQ[i][0] + mR[1][j] * mQ[i][1] + mR[2][j] * mQ[i][2];
			}
		}
	}

	//------------------------------------------------------------------------
	// Centerline check

	// Diagonal of A is taken as the eigenvalues, rows of S as the eigenvectors
	for(int c = 0; c <= 2; c++){
		eValues[c] = mA[c][c];
		for(int d = 0; d <= 2; d++){
			eVectors[c][d] = mS[c][d];
		}
	}

	// Sort eigenvalues ascending, swapping the matching eigenvector rows
	float temp = 0;
	for(int i = 0; i <= 2; i++){
		for(int j = i; j <= 2; j++){
			if(eValues[i] > eValues[j]){
				temp = eValues[i];
				eValues[i] = eValues[j];
				eValues[j] = temp;

				for(int d = 0; d <= 2; d++){
					temp = eVectors[i][d];
					eVectors[i][d] = eVectors[j][d];
					eVectors[j][d] = temp;
				}
			}
		}
	}

	// The point is marked when the two smallest eigenvalues are non-positive and
	// both of their vectors have an exactly zero dot product with the gradient.
	if(eValues[0] <= 0 && eValues[1] <= 0){
		prodV1 = produtoVetores(eVectors[0][0], eVectors[0][1], eVectors[0][2], gradX, gradY, gradZ);
		prodV2 = produtoVetores(eVectors[1][0], eVectors[1][1], eVectors[1][2], gradX, gradY, gradZ);

		if(prodV1 == 0 && prodV2 == 0){
			//if(eValues[1]/eValues[0] >= 0.5){
			dadosIn[indice] = 500;  // the global write that keeps the computation above from being optimised away
			//}
		}
	}
}

It doesn’t. But without it, all of the calculations before it are irrelevant and the compiler removes them, which greatly reduces the complexity and execution time of the kernel. If you pass -Xptxas="-v" to nvcc, you should see the difference in kernel size and register usage, which confirms this.

Is there any way to disable video output on the Nvidia card, use it only for CUDA, and use the onboard video for the display in Windows 7? Alternatively, can I make the kernel sleep so that the display can be updated, and then have the kernel continue executing?

When I split the kernel, I use a lot of memory for the vectors passed as parameters, so I get the message “out of memory”. I tried splitting both the kernel and the image volume, but then I do many copies from host memory to device memory and vice versa, which takes so much time that I don't think it is good for the application's total run time.

While implementing the kernels, I found that the QR method used to obtain the eigenvalues and eigenvectors is what causes the message “cuda the launch timed out and was terminated” whenever the loop runs for more than 4 iterations.

Yes, you can: use another card for video. You can also disable that timer in the Windows settings, or run far fewer blocks in each kernel launch.

Your blocks look independent, don't they? How many blocks do you launch?

Where can I disable that timer in the Windows settings?

I execute one kernel from one thread. I use 8 threads per block and a grid of 256x14080 blocks. I also tried 1 and 512 threads per block, but the same error happens.

I found this page, http://www.microsoft.com/whdc/device/display/wddm_timeout.mspx, which explains how to change the GPU timeout detection. I added the registry keys:

TdrLevel = 3
TdrDelay = 12
TdrDdiDelay = 5

but the error persists…

Try splitting your grid into small parts and check whether it works. Do not execute everything at once; launch a grid with only a few blocks at a time.