cudaMemcpy gives invalid argument error when successive kernels uses data on device

dhabeps · October 22, 2011, 8:59am

dear all;
I am in trouble and i need help. I have to do matrix multiplication L>1 times of 2 matrices. One matrix is M which is small and other is big. Matrices are stored as a linear array. I am using a for loop from i=0 to L-1 and copying big matrix only once from host to device for optimization. then using a kernel to do matrix multiplication, transpose and reshape. The result of this computation is used by next kernel launch. Code is working fine till completion of for loop. But when i am copying the overall result from device to host, cudaMemcpy gives error “invalid argument”. I failed to find cause of this error and fix it. Your help is highly appreciated. For better understanding i am also giving my CUDA code. I am using CUDA 4.0 on VS2010 on windows 7. Another thing is that , I have tried a lot to find some code example about successive kernel
launches using the data that resides on device , but failed to find. please let me know if it is given in any reference.

double * BMatrix(double *A, int *D,int L,double *xInf, double *xSup, double *BinoMat)
{
int i;
double *d_A,*d_M,*d_result;
int numofblocks, blocksize=256;
double *M, *M1,*M2,*M3,*M4;//holding matrices
int rowsA, colsA, rowsM, colsM;
long int size=1;
int maxofd=-1;
cudaError_t err;

 //find out total size of matrix in "size" and get maximum degree of variable from D
for(i=0;i<L;i++)
{
	size*=(D[i]+1);
	if(D[i]>maxofd)
		maxofd=D[i];
}
// Set the device with maximum power
 CUDAInitialize();
//getchar();

 	//Set the number of blocks
			numofblocks=size/blocksize;
			if ((size%blocksize)>0)
			 	numofblocks++;
//Allocate space to store final result on host
	 	double * h_result=(double*)malloc(size*sizeof(double));
 	if(h_result==NULL)
			 {
				 printf("\n Memory alllocation failed for Final Bernstein matrix on Host");
				 exit(EXIT_FAILURE);
			 }
	
 for(i=0;i<L;i++)
{
 	//Get the product of UxInv, VxInv and WxInv using serial processing
		M1=InverseUx(D[i],BinoMat,maxofd);
		M2=InverseVx(D[i],xInf[i],xSup[i]);
		M3=InverseWx(D[i],xInf[i],BinoMat,maxofd);
		M4=MatMulti2(M1,D[i]+1,D[i]+1,M2,D[i]+1,D[i]+1);
		free(M1);
		free(M2);
		M=MatMulti2(M4,(D[i]+1),(D[i]+1),M3,(D[i]+1),(D[i]+1));
		free(M3);
		free(M4);
		//printf("\n this part is over for i=%d",i);
		rowsA=D[i]+1;
		colsA=size/(D[i]+1);
		rowsM=D[i]+1;
		colsM=D[i]+1;
	 	
		//Test for matrix multiplication condition
		 if(rowsA!=colsM)
		 {
			 printf("\n Matrices are not product compatible");
			 exit(EXIT_FAILURE);
		 }

		
			if (i==0) //copy the coefficient matrix A to device only once
			{
	 
				//Allocate space on device for matrix A
				 if (cudaSuccess!=	(err=cudaMalloc(&d_A, (rowsA*colsA)*sizeof(double))))
				 {
					 printf("\n Error in cudaMalloc in matrix multiplication for allocating space to matrix A on device\n");
					printf("\n Error is-%s",(char*)cudaGetErrorString(err)); 
					
					exit(EXIT_FAILURE);
				 } 
	 
				//copy matrix A from host to device
				if (cudaSuccess!=(err=cudaMemcpy(d_A,A,size*sizeof(double),cudaMemcpyHostToDevice)))
				{
					printf("\n Error in cudaMemcpy execution in matrix multiplication 2\n");
					printf("\n Error is-%s",(char*)cudaGetErrorString(err)); 
					exit(EXIT_FAILURE);
				}	
			}	
		 
	 	//Allocate space for Matrix M
			if (cudaSuccess!=	(err=cudaMalloc(&d_M, (rowsM*colsM)*sizeof(double))))
			{
				printf("\n Error in cudaMalloc in matrix multiplication for allocating space to result matrix device\n");
				printf("\n Error is-%s",(char*)cudaGetErrorString(err)); 
				exit(EXIT_FAILURE);
			}
			//Copy M from host to device
			if (cudaSuccess!=(err=cudaMemcpy(d_M,M,(rowsM*colsM)*sizeof(double),cudaMemcpyHostToDevice)))
			{
			 printf("\n Error in cudaMemcpy execution in matrix multiplication new\n");
			 printf("\n Error is-%s",(char*)cudaGetErrorString(err)); 
			 exit(EXIT_FAILURE);
			}
	 
	 	//Allocate space for their result
			if (cudaSuccess!=	(err=cudaMalloc(&d_result, (rowsA*colsM)*sizeof(double))))
			{
				printf("\n Error in cudaMalloc in matrix multiplication for allocating space to result matrix device\n");
				printf("\n Error is-%s",(char*)cudaGetErrorString(err)); 
				exit(EXIT_FAILURE);
			}
			
		 //call the kernel to do the matrix multiplication
			if(i<(L-1))
			{
 	MatMulti2Kernel<<<numofblocks, blocksize>>>(d_M,rowsM,colsM,d_A,rowsA,colsA,d_result,blocksize, D[i+1]+1,size/(D[i+1]+1));
			 cudaThreadSynchronize();//block until kernel executes fully
			 //test for correct kernel execution
				if ( cudaSuccess != (err=cudaGetLastError()) )
				{
					printf( "\nError in matrix multiplication kernel execution\n" );
					printf( "\n-%s",(char*) cudaGetErrorString(err));
					exit(EXIT_FAILURE);
				}
			}
			else
			{
				MatMulti2Kernel<<<numofblocks, blocksize>>>(d_M,rowsM,colsM,d_A,rowsA,colsA,d_result,blocksize, D[0]+1,size/(D[0]+1));
				cudaThreadSynchronize();//block until kernel executes fully
				//test for correct kernel execution
				if ( cudaSuccess != (err=cudaGetLastError()) )
				{
					printf( "\nError in matrix multiplication kernel execution\n" );
					printf( "\n-%s",(char*) cudaGetErrorString(err));
					exit(EXIT_FAILURE);
				}
			}

		 
				
		
//Free the space on device 
		 	d_A=d_result; 
			 cudaFree(d_M);
			

} //for loop end here
	 
 	

	//BMatrix is now computed and need to be copied from device to host
//HERE IS AN ERROR-INVALID ARGUMENT
			//copy matrix d_A from device to host
			if (cudaSuccess!=(err=cudaMemcpy(h_result,d_A,(rowsA*colsA)*sizeof(double),cudaMemcpyDeviceToHost)))
			{
			 printf("\n Error in cudaMemcpy for transferring final result from device to host");
			 printf( "\n-%s",(char*) cudaGetErrorString(err));
			 exit(EXIT_FAILURE);
			}
		
 	cudaFree(d_result);
		 cudaFree(d_A);
	 return h_result; 	
 }

Thanks in advance

tera · October 22, 2011, 4:49pm

Check return codes even for cudaFree() - that would point out that [font=“Courier New”]d_result[/font] is freed twice (note the [font=“Courier New”]d_A=d_result[/font] statement earlier in the code).

Also the code is leaking device memory: [font=“Courier New”]d_A[/font] is allocated on the first loop iteration, but never freed. [font=“Courier New”]d_result[/font] is allocated on every iteration, but only freed (twice) after the loop finishes.

P.S.: Please edit your post to include the code in [font=“Courier New”][[/i]code][/font]…[font=“Courier New”][[i]/code][/font] tags - it’s a lot more readable that way.

tera · October 22, 2011, 5:08pm

It would also be beneficial to move the cudaMalloc() and cudaFree() call with fixed dimensions out of the loop. They are quite costly as they change the page tables on the GPU.

dhabeps · October 23, 2011, 8:07am

Thanks tera for your prompt reply. i ll definitely check it and hope ful. Thanks you.

dhabeps · October 23, 2011, 7:10pm

Ya i got the error, which is in line " if (cudaSuccess!= (err=cudaMalloc(&d_result, (rowsA*colsM)*sizeof(double))))". correction is colsA instead of colsM.
Many thanks tera your suggestion moving fixed size cudaMalloc()and cudaFree() at the end saved considerable time. Thanks.

Topic		Replies	Views
matrix multiplication with its transpose in cuda(cudamemcpy from device to host not working) . CUDA Programming and Performance	6	1766	October 5, 2018
cudaMemcpy: invalid argument issue CUDA Programming and Performance	2	16895	September 5, 2011
Matrix Multiplication Buggy CUDA Programming and Performance	13	5234	May 5, 2010
cudaMemcpy error "invalid argument" from in-kernel malloc'ed device mem buffer on cuda 4 CUDA Programming and Performance	8	13764	February 28, 2012
2d matrix passing values help with this code CUDA Programming and Performance	4	3205	November 10, 2010
Help with cuda 2d array CUDA Programming and Performance	6	7452	September 29, 2014
Invalid argument when calling cudaMemcpy3D CUDA Programming and Performance	6	2089	October 4, 2010
Noob question: Kernel configuration failed, invalid configuration argument CUDA Programming and Performance	7	17683	July 31, 2009
How to copy Device Struct with pointers to Host? CUDA Programming and Performance	10	8235	July 8, 2014
cudaMalloc error in big loop CUDA Programming and Performance	12	15608	May 21, 2008

cudaMemcpy gives invalid argument error when successive kernels uses data on device

Related topics