Error on Kernel launch

Hello!

Im trying to write a program that receives some data, calculate using Runge-Kutta 4 method and print to a txt file.

#define TAM_MAX 1e7

...

__device__ float fi(float x,float z, float p1,float p2,float nada){

  float k1,k2,k3,k4;

if(threadIdx.x == 0){  

	  k1 = dxdt(x , z , p1);

	  k2 = dxdt(x+k1*h/2 , z+k1*h/2 , p1+k1*h/2);

	  k3 = dxdt(x+k2*h/2 , z+k2*h/2 , p1+k2*h/2);

	  k4 = dxdt(x+k3*h , z+k3*h , p1+k3*h);	

  }	

	   

	if(threadIdx.x == 1){

	  

	  k1 = dzdt(x,z,p2);

	  k2 = dzdt(x+k1*h/2,z+k1*h/2,p2+k1*h/2);

	  k3 = dzdt(x+k2*h/2,z+k2*h/2,p2+k2*h/2);

	  k4 = dzdt(x+k3*h,z+k3*h,p2+k3*h);

	}

	

	

	if(threadIdx.x == 2){

	

	  k1 = dp1dt(x,z);

	  k2 = dp1dt(x+k1*h/2,z+k1*h/2);

	  k3 = dp1dt(x+k2*h/2,z+k2*h/2);

	  k4 = dp1dt(x+k3*h,z+k3*h);

	}

	

	if(threadIdx.x == 3){

	  

	  k1 = dp2dt(x,z);

	  k2 = dp2dt(x+k1*h/2,z+k1*h/2);

	  k3 = dp2dt(x+k2*h/2,z+k2*h/2);

	  k4 = dp2dt(x+k3*h,z+k3*h);

	}  

	return h*(k1+2*k2+2*k3+k4)/6;

}

...

//Inside my main:

  dim3 dimBlock(4,Raios);

  dim3 dimGrid(Pos0,1);

  k_iteracao<<<dimGrid,dimBlock>>>(d_inx,d_inteta,p,d_p1,d_p2,Pos0,Raios,d_X);

  printf("%s\n",cudaGetErrorString(cudaGetLastError()));

  printf("%s\n",cudaGetErrorString(cudaThreadSynchronize()));

...

  cudaMemcpy(h_X,d_X,TAM_MAX*sizeof(float),cudaMemcpyDeviceToHost);

...

__global__ void k_iteracao(float *d_inx,float *d_inteta, float nada,float *d_p1,float *d_p2,int pos, int raios ,float *d_X){

__shared__ float Y[4];

  int i = 0,posicao;

  float t = 0 , t0 = 0;

Y[0] = d_inx[blockIdx.x];

  Y[1] = 0;  

  Y[2] = d_p1[threadIdx.y];

  Y[3] = d_p2[threadIdx.y];

while((Y[0] <= largurad) && (Y[0] >= largurae) && (Y[1] <= profundidade) && (t <= tempolimite)){

  //Cada thread resolve a iteracao de acordo com a sua ID e depois espera os outros	

	Y[threadIdx.x] = Y[threadIdx.x]+fi(threadIdx.x,Y[0],Y[1],Y[2],Y[3],h);

	//  if( threadIdx.x == 1) printf("Tiro: %d - Raio: %d - x: %f, z: %f, p1: %f, p2: %f\n",blockIdx.x,threadIdx.y,Y[0],Y[1],Y[2],Y[3]); 

	

	if( threadIdx.x == 0 && threadIdx.y == 0 ){

	  posicao = blockIdx.x*(int)(TAM_MAX / pos) + threadIdx.y*(int)(TAM_MAX / (raios*pos));

	  d_X[posicao + (i*5+0)] = blockIdx.x;

	  d_X[posicao + (i*5+1)] = threadIdx.y;

	  d_X[posicao + (i*5+2)] = t;

	  d_X[posicao + (i*5+3)] = Y[0];

	  d_X[posicao + (i*5+4)] = Y[1];

	  // printf("x: %f,z: %f,p1: %f,p2: %f\n",d_X[posicao  + (i*5+0)],d_X[posicao  + (i*5+1)],d_X[posicao  + (i*5+2)],d_X[posicao  + (i*5+3)],d_X[posicao + (i*5+4)]);

	

	i++;

	}

	t = t + h;

}

  d_X[blockIdx.x*(int)(TAM_MAX / pos) + threadIdx.y*(int)(TAM_MAX / (raios*pos)) + i*5 + 0] = -1; 

}

When i execute it i receive 2 error messages after kernel execution:

"invalid configuration argument

the launch timed out and was terminated"

d_inx,d_inteta,d_p1,d_p2 are arrays with all data received.

pos = 48 and raios = 18.

Each block should do the job for only one d_inx element, and each threadIdx.y layer should calculate only one d_inteta element.

Im trying to make this code work for days. Looks like cudaMemcpy isnt copying any data from device to host.

Can someone please help me find what is wrong?

PS: Sorry for my bad english.

The error messages are telling you the kernel is never launching because the execution configuration you are providing is invalid. If I read your code correctly, you are launching 72 threads per block. You should check the register usage of that kernel, it could be that your block size is too large and there are insufficient registers for the kernel to launch.

BTW: I don’t know whether the code you posted is what you are actually compiling and running, but the argument list of your device function and the call in the kernel don’t seem to match up…

The idea is to launch each block with 4 threads on x-axis and 48 threads on y-axis like you can see here:

...

  dim3 dimBlock(4,Raios); // Raios == 18

  dim3 dimGrid(Pos0,1);// Pos0 == 48

  k_iteracao<<<dimGrid,dimBlock>>>(d_inx,d_inteta,p,d_p1,d_p2,Pos0,Raios,d_X);

...

So, each block should have 72 threads and each grid should have 48 blocks.

How do i know what is “to much” for my system?

My argument list is ok. I just didnt want to paste all my code here. Just got the interesting part.

I made some changes in my code. Now looks like my kernel is launching fine (i got “no error” on return).

BUT, it doenst copy nothing from device to host.

My new Kernel:

// #########################################

#define profundidade 1000

#define largurae 0

#define largurad 5000

#define h 1e-2

#define tempolimite 4.0

#define dt tempolimite/1000

#define PI 3.14159265

#define TAM_MAX 1e7

// #########################################

...

__global__ void k_iteracao(float *d_inx, float *d_p1,float *d_p2,int pos, int raios ,float *d_X){

  //O objetivo de cada thread eh imprimir a posicao (x,z) de cada raio a cada periodo dt de tempo

float k1,k2,k3,k4;

  int iteracao = 0;

  __shared__ float Y[4];

  float t = 0,t0 = 0;

  int i = 0;

  int posicao = blockIdx.x*(int)(TAM_MAX / pos) + blockIdx.y*(int)(TAM_MAX / (raios*pos));

  Y[0] = d_inx[blockIdx.x];

  Y[1] = 0;  

  Y[2] = d_p1[blockIdx.y];

  Y[3] = d_p2[blockIdx.y];

while((Y[0] <= largurad) && (Y[0] >= largurae) && (Y[1] <= profundidade) && (t <= tempolimite)){

	iteracao++;

	if(threadIdx.x == 0){  

	  k1 = dxdt(Y[0] , Y[1] , Y[2]);

	  k2 = dxdt(Y[0]+k1*h/2 , Y[1]+k1*h/2 , Y[2]+k1*h/2);

	  k3 = dxdt(Y[0]+k2*h/2 , Y[1]+k2*h/2 , Y[2]+k2*h/2);

	  k4 = dxdt(Y[0]+k3*h , Y[1]+k3*h , Y[2]+k3*h);	

	}	

	if(threadIdx.x == 1){

	  

	  k1 = dzdt(Y[0] , Y[1] , Y[3]);

	  k2 = dzdt(Y[0]+k1*h/2 , Y[1]+k1*h/2 , Y[3]+k1*h/2);

	  k3 = dzdt(Y[0]+k2*h/2 , Y[1]+k2*h/2, Y[3]+k2*h/2);

	  k4 = dzdt(Y[0]+k3*h , Y[1]+k3*h , Y[3]+k3*h);

	}

	if(threadIdx.x == 2){

	  

	  k1 = dp1dt(Y[0] , Y[1]);

	  k2 = dp1dt(Y[0]+k1*h/2 , Y[1]+k1*h/2);

	  k3 = dp1dt(Y[0]+k2*h/2 , Y[1]+k2*h/2);

	  k4 = dp1dt(Y[0]+k3*h , Y[1]+k3*h);

	}

	if(threadIdx.x == 3){

	  

	  k1 = dp2dt(Y[0],Y[1]);

	  k2 = dp2dt(Y[0]+k1*h/2,Y[1]+k1*h/2);

	  k3 = dp2dt(Y[0]+k2*h/2,Y[1]+k2*h/2);

	  k4 = dp2dt(Y[0]+k3*h,Y[1]+k3*h);

	}  

//If i add a "__syncthreadsds()" here,  every thread with Idx.x >0 loses data on Y[1],Y[2] and Y[3]

	

Y[threadIdx.x] += (h*(k1+2*k2+2*k3+k4)/6);

	

   if((t - t0) >= dt){

	  

	  d_X[posicao + i*5+0] = blockIdx.x;

	  d_X[posicao + i*5+1] = blockIdx.y;

	  d_X[posicao + i*5+2] = t;

	  d_X[posicao + i*5+3] = Y[0];

	  d_X[posicao + i*5+4] = Y[1];

	  i++;

	  t0 = t;

	  d_X[posicao + i*5+0] = -1; 

	}

	  t = t+ h;

  }

}

And… it works on emulation mode :-/

Compile your code with --ptxas-options="-v" and the assembler will report the register, shared memory, local memory and constant memory usage of the kernel. Appendix A of the programming guide contains the hardware resource limits (either 8192,16384 or 32768 total registers per multiprocessor).

I was referring to this device function declaration:

__device__ float fi(float x,float z, float p1,float p2,float nada)

and this device function call:

Y[threadIdx.x] = Y[threadIdx.x]+fi(threadIdx.x,Y[0],Y[1],Y[2],Y[3],h);

Got this:

ptxas info : Compiling entry function ‘Z10k_iteracaoPfS_S_iiS

ptxas info : Used 26 registers, 140+0 bytes lmem, 72+16 bytes smem, 24 bytes cmem[0], 152 bytes cmem[1]

ptxas info : Compiling entry function ‘Z3k_pPfS_S_S

ptxas info : Used 15 registers, 56+0 bytes lmem, 32+16 bytes smem, 24 bytes cmem[0], 100 bytes cmem[1]

But i dont know if it is important since i got “no error” return (but still with some problem, as you can see)

Sorry, double post…

Register usage obviously wasn’t it then…

Your shared memory usage in your “new” kernel looks wrong to my eyes. Shared memory is block scope, so every running thread in a block will be reading and writing from one copy of Y. It probably only works in emulation mode because the warp size is 1 in emulation and threads are run serially.