cudaMemcpy gives invalid argument error when successive kernels uses data on device

dear all;
I am in trouble and i need help. I have to do matrix multiplication L>1 times of 2 matrices. One matrix is M which is small and other is big. Matrices are stored as a linear array. I am using a for loop from i=0 to L-1 and copying big matrix only once from host to device for optimization. then using a kernel to do matrix multiplication, transpose and reshape. The result of this computation is used by next kernel launch. Code is working fine till completion of for loop. But when i am copying the overall result from device to host, cudaMemcpy gives error “invalid argument”. I failed to find cause of this error and fix it. Your help is highly appreciated. For better understanding i am also giving my CUDA code. I am using CUDA 4.0 on VS2010 on windows 7. Another thing is that , I have tried a lot to find some code example about successive kernel
launches using the data that resides on device , but failed to find. please let me know if it is given in any reference.

double * BMatrix(double *A, int *D,int L,double *xInf, double *xSup, double *BinoMat)
{
int i;
double *d_A,*d_M,*d_result;
int numofblocks, blocksize=256;
double *M, *M1,*M2,*M3,*M4;//holding matrices
int rowsA, colsA, rowsM, colsM;
long int size=1;
int maxofd=-1;
cudaError_t err;

 //find out total size of matrix in "size" and get maximum degree of variable from D
for(i=0;i<L;i++)
{
	size*=(D[i]+1);
	if(D[i]>maxofd)
		maxofd=D[i];
}
// Set the device with maximum power
 CUDAInitialize();
//getchar();

 	//Set the number of blocks
			numofblocks=size/blocksize;
			if ((size%blocksize)>0)
			 	numofblocks++;
//Allocate space to store final result on host
	 	double * h_result=(double*)malloc(size*sizeof(double));
 	if(h_result==NULL)
			 {
				 printf("\n Memory alllocation failed for Final Bernstein matrix on Host");
				 exit(EXIT_FAILURE);
			 }
	
 for(i=0;i<L;i++)
{
 	//Get the product of UxInv, VxInv and WxInv using serial processing
		M1=InverseUx(D[i],BinoMat,maxofd);
		M2=InverseVx(D[i],xInf[i],xSup[i]);
		M3=InverseWx(D[i],xInf[i],BinoMat,maxofd);
		M4=MatMulti2(M1,D[i]+1,D[i]+1,M2,D[i]+1,D[i]+1);
		free(M1);
		free(M2);
		M=MatMulti2(M4,(D[i]+1),(D[i]+1),M3,(D[i]+1),(D[i]+1));
		free(M3);
		free(M4);
		//printf("\n this part is over for i=%d",i);
		rowsA=D[i]+1;
		colsA=size/(D[i]+1);
		rowsM=D[i]+1;
		colsM=D[i]+1;
	 	
		//Test for matrix multiplication condition
		 if(rowsA!=colsM)
		 {
			 printf("\n Matrices are not product compatible");
			 exit(EXIT_FAILURE);
		 }

		
			if (i==0) //copy the coefficient matrix A to device only once
			{
	 
				//Allocate space on device for matrix A
				 if (cudaSuccess!=	(err=cudaMalloc(&d_A, (rowsA*colsA)*sizeof(double))))
				 {
					 printf("\n Error in cudaMalloc in matrix multiplication for allocating space to matrix A on device\n");
					printf("\n Error is-%s",(char*)cudaGetErrorString(err)); 
					
					exit(EXIT_FAILURE);
				 } 
	 
				//copy matrix A from host to device
				if (cudaSuccess!=(err=cudaMemcpy(d_A,A,size*sizeof(double),cudaMemcpyHostToDevice)))
				{
					printf("\n Error in cudaMemcpy execution in matrix multiplication 2\n");
					printf("\n Error is-%s",(char*)cudaGetErrorString(err)); 
					exit(EXIT_FAILURE);
				}	
			}	
		 
	 	//Allocate space for Matrix M
			if (cudaSuccess!=	(err=cudaMalloc(&d_M, (rowsM*colsM)*sizeof(double))))
			{
				printf("\n Error in cudaMalloc in matrix multiplication for allocating space to result matrix device\n");
				printf("\n Error is-%s",(char*)cudaGetErrorString(err)); 
				exit(EXIT_FAILURE);
			}
			//Copy M from host to device
			if (cudaSuccess!=(err=cudaMemcpy(d_M,M,(rowsM*colsM)*sizeof(double),cudaMemcpyHostToDevice)))
			{
			 printf("\n Error in cudaMemcpy execution in matrix multiplication new\n");
			 printf("\n Error is-%s",(char*)cudaGetErrorString(err)); 
			 exit(EXIT_FAILURE);
			}
	 
	 	//Allocate space for their result
			if (cudaSuccess!=	(err=cudaMalloc(&d_result, (rowsA*colsM)*sizeof(double))))
			{
				printf("\n Error in cudaMalloc in matrix multiplication for allocating space to result matrix device\n");
				printf("\n Error is-%s",(char*)cudaGetErrorString(err)); 
				exit(EXIT_FAILURE);
			}
			
		 //call the kernel to do the matrix multiplication
			if(i<(L-1))
			{
 	MatMulti2Kernel<<<numofblocks, blocksize>>>(d_M,rowsM,colsM,d_A,rowsA,colsA,d_result,blocksize, D[i+1]+1,size/(D[i+1]+1));
			 cudaThreadSynchronize();//block until kernel executes fully
			 //test for correct kernel execution
				if ( cudaSuccess != (err=cudaGetLastError()) )
				{
					printf( "\nError in matrix multiplication kernel execution\n" );
					printf( "\n-%s",(char*) cudaGetErrorString(err));
					exit(EXIT_FAILURE);
				}
			}
			else
			{
				MatMulti2Kernel<<<numofblocks, blocksize>>>(d_M,rowsM,colsM,d_A,rowsA,colsA,d_result,blocksize, D[0]+1,size/(D[0]+1));
				cudaThreadSynchronize();//block until kernel executes fully
				//test for correct kernel execution
				if ( cudaSuccess != (err=cudaGetLastError()) )
				{
					printf( "\nError in matrix multiplication kernel execution\n" );
					printf( "\n-%s",(char*) cudaGetErrorString(err));
					exit(EXIT_FAILURE);
				}
			}

		 
				
		
//Free the space on device 
		 	d_A=d_result; 
			 cudaFree(d_M);
			

} //for loop end here
	 
 	

	//BMatrix is now computed and need to be copied from device to host
//HERE IS AN ERROR-INVALID ARGUMENT
			//copy matrix d_A from device to host
			if (cudaSuccess!=(err=cudaMemcpy(h_result,d_A,(rowsA*colsA)*sizeof(double),cudaMemcpyDeviceToHost)))
			{
			 printf("\n Error in cudaMemcpy for transferring final result from device to host");
			 printf( "\n-%s",(char*) cudaGetErrorString(err));
			 exit(EXIT_FAILURE);
			}
		
 	cudaFree(d_result);
		 cudaFree(d_A);
	 return h_result; 	
 }

Thanks in advance

Check return codes even for cudaFree() - that would point out that [font=“Courier New”]d_result[/font] is freed twice (note the [font=“Courier New”]d_A=d_result[/font] statement earlier in the code).

Also the code is leaking device memory: [font=“Courier New”]d_A[/font] is allocated on the first loop iteration, but never freed. [font=“Courier New”]d_result[/font] is allocated on every iteration, but only freed (twice) after the loop finishes.

P.S.: Please edit your post to include the code in [font=“Courier New”][code][/font]…[font=“Courier New”][/code][/font] tags - it’s a lot more readable that way.

It would also be beneficial to move the cudaMalloc() and cudaFree() call with fixed dimensions out of the loop. They are quite costly as they change the page tables on the GPU.

Thanks tera for your prompt reply. i ll definitely check it and hope ful. Thanks you.

Ya i got the error, which is in line " if (cudaSuccess!= (err=cudaMalloc(&d_result, (rowsA*colsM)*sizeof(double))))". correction is colsA instead of colsM.
Many thanks tera your suggestion moving fixed size cudaMalloc()and cudaFree() at the end saved considerable time. Thanks.