What is the "invalid argument" error?

When I use 512 threads per block I get the error: too many resources requested for launch

As per the suggestion given in the following link (https://visualization.hpc.mil/wiki/Handling_CUDA_error_messages) I reduced the number of threads.

However now I am getting a new error:
invalid argument

Can anybody help me tackle this error?
thanks

I think that your main problem is the “invalid argument” error. When you try to launch with 512 threads per block, the total number of registers that nvcc allocates for a block is more than 8192; thus, “too many resources requested for launch” occurred.
Why don’t you post your piece of code here, so that anybody can see and correct it?

I think that your main problem is the “invalid argument” error. When you try to launch with 512 threads per block, the total number of registers that nvcc allocates for a block is more than 8192; thus, “too many resources requested for launch” occurred.
Why don’t you post your piece of code here, so that anybody can see and correct it?

Hi, This is my piece of code

// Forward declaration: the helper is defined after this kernel, so it must be
// declared before its first use.
__device__ __host__ float unitvec_func(const int n, const float *f, const float *dfdx, const float Len);

/*
 * derivative_bondangleGPU
 *
 * One thread per index idx in [0, n) (the original comment notes n = n_q).
 * Each thread loops over the n_angleid entries and
 *   - stores into DphiDq_d[idx] the value computed for the LAST entry
 *     (j = n_angleid - 1); earlier entries are overwritten, exactly as in the
 *     original code,
 *   - adds every entry's contribution to dEdx_d[idx].
 *
 * Array layouts, as established by the indexing below:
 *   angleid_d : 3 consecutive segments of n_angleid ints (j, j + n_angleid,
 *               j + 2*n_angleid); the middle segment is the index shared by
 *               both difference vectors.
 *   diff_d    : 2 consecutive segments of 3*n_angleid floats (xyz triples).
 *   norm_d    : 2 consecutive segments of n_angleid floats, used as the
 *               lengths of the corresponding diff_d vectors.
 *   phi_d     : n_angleid floats.
 *   dxdq_d, dydq_d, dzdq_d : indexed as [n * angleid + idx].
 *
 * Ktheta, p1, phi_max and phi_min are not parameters; they must be defined
 * elsewhere in the translation unit.
 *
 * Launch: 1D grid and 1D blocks with gridDim.x * blockDim.x >= n. Check
 * cudaGetLastError() right after the launch so configuration errors such as
 * "too many resources requested for launch" are reported at the launch site.
 *
 * NOTE(review): the result is not guarded against sinf(phi_d[j]) == 0 or
 * norm_d[...] == 0; confirm the inputs exclude these cases.
 * NOTE(review): assumes the output arrays do not alias any input array,
 * because outputs are now written once after the loop instead of once per
 * iteration.
 */
__global__ void derivative_bondangleGPU (const int n,const int n_angleid,const float *phi_d,float *DphiDq_d,const int *angleid_d,const float *diff_d,const float *dxdq_d,const float *dydq_d,const float *dzdq_d,const float *norm_d,float *dEdx_d)
{
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;

    // Threads past the end of the data, or an empty angle list, touch nothing.
    if (idx >= n || n_angleid <= 0)
        return;

    // Keep the running sum in a register (seeded with the current value so the
    // summation order matches the original) and store it once at the end.
    float dEdx = dEdx_d[idx];
    float DphiDq = 0.0f;

    for (int j = 0; j < n_angleid; ++j) {
        // Per-angle values that the original re-read from global memory.
        const float len1 = norm_d[j];
        const float len2 = norm_d[j + n_angleid];
        const float phi  = phi_d[j];

        const int first  = n*angleid_d[j]               + idx;
        const int centre = n*angleid_d[j +   n_angleid] + idx;
        const int last   = n*angleid_d[j + 2*n_angleid] + idx;

        float vec1[3], vec2[3], Dvec1[3], Dvec2[3];

        // First difference vector and its derivative.
        vec1[0] = diff_d[3*j];
        vec1[1] = diff_d[3*j + 1];
        vec1[2] = diff_d[3*j + 2];
        Dvec1[0] = dxdq_d[first] - dxdq_d[centre];
        Dvec1[1] = dydq_d[first] - dydq_d[centre];
        Dvec1[2] = dzdq_d[first] - dzdq_d[centre];

        float dotprod = unitvec_func(3, vec1, Dvec1, len1);
        for (int k = 0; k < 3; ++k)
            Dvec1[k] = (Dvec1[k] - (vec1[k]*dotprod)/len1)/len1;

        // Second difference vector and its derivative.
        vec2[0] = diff_d[3*j     + 3*n_angleid];
        vec2[1] = diff_d[3*j + 1 + 3*n_angleid];
        vec2[2] = diff_d[3*j + 2 + 3*n_angleid];
        Dvec2[0] = dxdq_d[last] - dxdq_d[centre];
        Dvec2[1] = dydq_d[last] - dydq_d[centre];
        Dvec2[2] = dzdq_d[last] - dzdq_d[centre];

        dotprod = unitvec_func(3, vec2, Dvec2, len2);
        for (int k = 0; k < 3; ++k)
            Dvec2[k] = (Dvec2[k] - (vec2[k]*dotprod)/len2)/len2;

        // Scale both vectors by their lengths and combine with the derivatives.
        dotprod = 0.0f;
        for (int k = 0; k < 3; ++k) {
            vec1[k] = vec1[k]/len1;
            vec2[k] = vec2[k]/len2;
            dotprod += vec1[k]*Dvec2[k] + vec2[k]*Dvec1[k];
        }

        DphiDq = -dotprod/sinf(phi);

        // Each exponential is evaluated once (the original evaluated it three
        // times); float literals keep the arithmetic out of double precision.
        const float eMax = expf(p1*(phi_max - phi));
        const float eMin = expf(p1*(phi - phi_min));

        dEdx += Ktheta*p1*DphiDq*(
                    eMax/((1.0f + eMax)*(1.0f + eMax))
                  - eMin/((1.0f + eMin)*(1.0f + eMin))
                );
    }

    DphiDq_d[idx] = DphiDq;
    dEdx_d[idx]   = dEdx;
}

/*
 * unitvec_func
 *
 * Returns the sum over i in [0, n) of (f[i] * dfdx[i]) / Len.
 * Callable from both host and device code.
 *
 *   n    : number of elements read from f and dfdx (n <= 0 yields 0).
 *   f    : first input array, at least n floats.
 *   dfdx : second input array, at least n floats.
 *   Len  : divisor applied to every product; not checked for zero.
 */
__device__ __host__ float unitvec_func (const int n,const float *f,const float *dfdx,const float Len)
{
    float result = 0.0f;

    // Divide each term separately (as the original did) so the rounding of the
    // result is unchanged for existing callers.
    for (int i = 0; i < n; ++i)
        result += (f[i]*dfdx[i])/Len;

    return result;
}

The following is the information I get when I compile with --ptxas-options="-v":

ptxas info : Compiling entry function ‘_Z23derivative_bondangleGPUiiPKfPfPKiS0_S0_S0_S0_S0_S1_’ for ‘sm_13’

ptxas info : Used 35 registers, 28+0 bytes lmem, 80+16 bytes smem, 24 bytes cmem[0], 120 bytes cmem[1]

thanks a lot

Hi, This is my piece of code

// Forward declaration: the helper is defined after this kernel, so it must be
// declared before its first use.
__device__ __host__ float unitvec_func(const int n, const float *f, const float *dfdx, const float Len);

/*
 * derivative_bondangleGPU
 *
 * One thread per index idx in [0, n) (the original comment notes n = n_q).
 * Each thread loops over the n_angleid entries and
 *   - stores into DphiDq_d[idx] the value computed for the LAST entry
 *     (j = n_angleid - 1); earlier entries are overwritten, exactly as in the
 *     original code,
 *   - adds every entry's contribution to dEdx_d[idx].
 *
 * Array layouts, as established by the indexing below:
 *   angleid_d : 3 consecutive segments of n_angleid ints (j, j + n_angleid,
 *               j + 2*n_angleid); the middle segment is the index shared by
 *               both difference vectors.
 *   diff_d    : 2 consecutive segments of 3*n_angleid floats (xyz triples).
 *   norm_d    : 2 consecutive segments of n_angleid floats, used as the
 *               lengths of the corresponding diff_d vectors.
 *   phi_d     : n_angleid floats.
 *   dxdq_d, dydq_d, dzdq_d : indexed as [n * angleid + idx].
 *
 * Ktheta, p1, phi_max and phi_min are not parameters; they must be defined
 * elsewhere in the translation unit.
 *
 * Launch: 1D grid and 1D blocks with gridDim.x * blockDim.x >= n. Check
 * cudaGetLastError() right after the launch so configuration errors such as
 * "too many resources requested for launch" are reported at the launch site.
 *
 * NOTE(review): the result is not guarded against sinf(phi_d[j]) == 0 or
 * norm_d[...] == 0; confirm the inputs exclude these cases.
 * NOTE(review): assumes the output arrays do not alias any input array,
 * because outputs are now written once after the loop instead of once per
 * iteration.
 */
__global__ void derivative_bondangleGPU (const int n,const int n_angleid,const float *phi_d,float *DphiDq_d,const int *angleid_d,const float *diff_d,const float *dxdq_d,const float *dydq_d,const float *dzdq_d,const float *norm_d,float *dEdx_d)
{
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;

    // Threads past the end of the data, or an empty angle list, touch nothing.
    if (idx >= n || n_angleid <= 0)
        return;

    // Keep the running sum in a register (seeded with the current value so the
    // summation order matches the original) and store it once at the end.
    float dEdx = dEdx_d[idx];
    float DphiDq = 0.0f;

    for (int j = 0; j < n_angleid; ++j) {
        // Per-angle values that the original re-read from global memory.
        const float len1 = norm_d[j];
        const float len2 = norm_d[j + n_angleid];
        const float phi  = phi_d[j];

        const int first  = n*angleid_d[j]               + idx;
        const int centre = n*angleid_d[j +   n_angleid] + idx;
        const int last   = n*angleid_d[j + 2*n_angleid] + idx;

        float vec1[3], vec2[3], Dvec1[3], Dvec2[3];

        // First difference vector and its derivative.
        vec1[0] = diff_d[3*j];
        vec1[1] = diff_d[3*j + 1];
        vec1[2] = diff_d[3*j + 2];
        Dvec1[0] = dxdq_d[first] - dxdq_d[centre];
        Dvec1[1] = dydq_d[first] - dydq_d[centre];
        Dvec1[2] = dzdq_d[first] - dzdq_d[centre];

        float dotprod = unitvec_func(3, vec1, Dvec1, len1);
        for (int k = 0; k < 3; ++k)
            Dvec1[k] = (Dvec1[k] - (vec1[k]*dotprod)/len1)/len1;

        // Second difference vector and its derivative.
        vec2[0] = diff_d[3*j     + 3*n_angleid];
        vec2[1] = diff_d[3*j + 1 + 3*n_angleid];
        vec2[2] = diff_d[3*j + 2 + 3*n_angleid];
        Dvec2[0] = dxdq_d[last] - dxdq_d[centre];
        Dvec2[1] = dydq_d[last] - dydq_d[centre];
        Dvec2[2] = dzdq_d[last] - dzdq_d[centre];

        dotprod = unitvec_func(3, vec2, Dvec2, len2);
        for (int k = 0; k < 3; ++k)
            Dvec2[k] = (Dvec2[k] - (vec2[k]*dotprod)/len2)/len2;

        // Scale both vectors by their lengths and combine with the derivatives.
        dotprod = 0.0f;
        for (int k = 0; k < 3; ++k) {
            vec1[k] = vec1[k]/len1;
            vec2[k] = vec2[k]/len2;
            dotprod += vec1[k]*Dvec2[k] + vec2[k]*Dvec1[k];
        }

        DphiDq = -dotprod/sinf(phi);

        // Each exponential is evaluated once (the original evaluated it three
        // times); float literals keep the arithmetic out of double precision.
        const float eMax = expf(p1*(phi_max - phi));
        const float eMin = expf(p1*(phi - phi_min));

        dEdx += Ktheta*p1*DphiDq*(
                    eMax/((1.0f + eMax)*(1.0f + eMax))
                  - eMin/((1.0f + eMin)*(1.0f + eMin))
                );
    }

    DphiDq_d[idx] = DphiDq;
    dEdx_d[idx]   = dEdx;
}

/*
 * unitvec_func
 *
 * Returns the sum over i in [0, n) of (f[i] * dfdx[i]) / Len.
 * Callable from both host and device code.
 *
 *   n    : number of elements read from f and dfdx (n <= 0 yields 0).
 *   f    : first input array, at least n floats.
 *   dfdx : second input array, at least n floats.
 *   Len  : divisor applied to every product; not checked for zero.
 */
__device__ __host__ float unitvec_func (const int n,const float *f,const float *dfdx,const float Len)
{
    float result = 0.0f;

    // Divide each term separately (as the original did) so the rounding of the
    // result is unchanged for existing callers.
    for (int i = 0; i < n; ++i)
        result += (f[i]*dfdx[i])/Len;

    return result;
}

The following is the information I get when I compile with --ptxas-options="-v":

ptxas info : Compiling entry function ‘_Z23derivative_bondangleGPUiiPKfPfPKiS0_S0_S0_S0_S0_S1_’ for ‘sm_13’

ptxas info : Used 35 registers, 28+0 bytes lmem, 80+16 bytes smem, 24 bytes cmem[0], 120 bytes cmem[1]

thanks a lot

It is very hard to find the problem in your code.
However, I have a suggestion.
First, comment out all of the code inside the kernel and compile again, to see whether the error still happens.
If it does not, comment out everything inside the kernel except the first line, then recompile.
Continue in this way until you find out what the cause is.

How many threads and blocks are you using?
Make sure that the total number of registers used by a block is smaller than or equal to 8192, and that the shared memory used by each block does not exceed 16 KB.

It is very hard to find the problem in your code.
However, I have a suggestion.
First, comment out all of the code inside the kernel and compile again, to see whether the error still happens.
If it does not, comment out everything inside the kernel except the first line, then recompile.
Continue in this way until you find out what the cause is.

How many threads and blocks are you using?
Make sure that the total number of registers used by a block is smaller than or equal to 8192, and that the shared memory used by each block does not exceed 16 KB.

Hi,

I think it has to do with the number of pointers that I am passing to the kernel; even if I comment out the whole code, it still gives the same error. But if I pass only a few variables (of course, then this same code won’t be there), at least the error doesn’t show up. Could it be due to some error in a separate function? I have noticed some strange errors with nvcc; e.g., in one of my functions cublasAlloc was failing because I had passed an array of the wrong size in a completely different function.

thanks,

Hi,

I think it has to do with the number of pointers that I am passing to the kernel; even if I comment out the whole code, it still gives the same error. But if I pass only a few variables (of course, then this same code won’t be there), at least the error doesn’t show up. Could it be due to some error in a separate function? I have noticed some strange errors with nvcc; e.g., in one of my functions cublasAlloc was failing because I had passed an array of the wrong size in a completely different function.

thanks,