dear all;
I am in trouble and i need help. I have to do matrix multiplication L>1 times of 2 matrices. One matrix is M which is small and other is big. Matrices are stored as a linear array. I am using a for loop from i=0 to L-1 and copying big matrix only once from host to device for optimization. then using a kernel to do matrix multiplication, transpose and reshape. The result of this computation is used by next kernel launch. Code is working fine till completion of for loop. But when i am copying the overall result from device to host, cudaMemcpy gives error “invalid argument”. I failed to find cause of this error and fix it. Your help is highly appreciated. For better understanding i am also giving my CUDA code. I am using CUDA 4.0 on VS2010 on windows 7. Another thing is that , I have tried a lot to find some code example about successive kernel
launches using the data that resides on device , but failed to find. please let me know if it is given in any reference.
double * BMatrix(double *A, int *D,int L,double *xInf, double *xSup, double *BinoMat)
{
int i;
double *d_A,*d_M,*d_result;
int numofblocks, blocksize=256;
double *M, *M1,*M2,*M3,*M4;//holding matrices
int rowsA, colsA, rowsM, colsM;
long int size=1;
int maxofd=-1;
cudaError_t err;
//find out total size of matrix in "size" and get maximum degree of variable from D
for(i=0;i<L;i++)
{
size*=(D[i]+1);
if(D[i]>maxofd)
maxofd=D[i];
}
// Set the device with maximum power
CUDAInitialize();
//getchar();
//Set the number of blocks
numofblocks=size/blocksize;
if ((size%blocksize)>0)
numofblocks++;
//Allocate space to store final result on host
double * h_result=(double*)malloc(size*sizeof(double));
if(h_result==NULL)
{
printf("\n Memory alllocation failed for Final Bernstein matrix on Host");
exit(EXIT_FAILURE);
}
for(i=0;i<L;i++)
{
//Get the product of UxInv, VxInv and WxInv using serial processing
M1=InverseUx(D[i],BinoMat,maxofd);
M2=InverseVx(D[i],xInf[i],xSup[i]);
M3=InverseWx(D[i],xInf[i],BinoMat,maxofd);
M4=MatMulti2(M1,D[i]+1,D[i]+1,M2,D[i]+1,D[i]+1);
free(M1);
free(M2);
M=MatMulti2(M4,(D[i]+1),(D[i]+1),M3,(D[i]+1),(D[i]+1));
free(M3);
free(M4);
//printf("\n this part is over for i=%d",i);
rowsA=D[i]+1;
colsA=size/(D[i]+1);
rowsM=D[i]+1;
colsM=D[i]+1;
//Test for matrix multiplication condition
if(rowsA!=colsM)
{
printf("\n Matrices are not product compatible");
exit(EXIT_FAILURE);
}
if (i==0) //copy the coefficient matrix A to device only once
{
//Allocate space on device for matrix A
if (cudaSuccess!= (err=cudaMalloc(&d_A, (rowsA*colsA)*sizeof(double))))
{
printf("\n Error in cudaMalloc in matrix multiplication for allocating space to matrix A on device\n");
printf("\n Error is-%s",(char*)cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//copy matrix A from host to device
if (cudaSuccess!=(err=cudaMemcpy(d_A,A,size*sizeof(double),cudaMemcpyHostToDevice)))
{
printf("\n Error in cudaMemcpy execution in matrix multiplication 2\n");
printf("\n Error is-%s",(char*)cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
//Allocate space for Matrix M
if (cudaSuccess!= (err=cudaMalloc(&d_M, (rowsM*colsM)*sizeof(double))))
{
printf("\n Error in cudaMalloc in matrix multiplication for allocating space to result matrix device\n");
printf("\n Error is-%s",(char*)cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//Copy M from host to device
if (cudaSuccess!=(err=cudaMemcpy(d_M,M,(rowsM*colsM)*sizeof(double),cudaMemcpyHostToDevice)))
{
printf("\n Error in cudaMemcpy execution in matrix multiplication new\n");
printf("\n Error is-%s",(char*)cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//Allocate space for their result
if (cudaSuccess!= (err=cudaMalloc(&d_result, (rowsA*colsM)*sizeof(double))))
{
printf("\n Error in cudaMalloc in matrix multiplication for allocating space to result matrix device\n");
printf("\n Error is-%s",(char*)cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//call the kernel to do the matrix multiplication
if(i<(L-1))
{
MatMulti2Kernel<<<numofblocks, blocksize>>>(d_M,rowsM,colsM,d_A,rowsA,colsA,d_result,blocksize, D[i+1]+1,size/(D[i+1]+1));
cudaThreadSynchronize();//block until kernel executes fully
//test for correct kernel execution
if ( cudaSuccess != (err=cudaGetLastError()) )
{
printf( "\nError in matrix multiplication kernel execution\n" );
printf( "\n-%s",(char*) cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
else
{
MatMulti2Kernel<<<numofblocks, blocksize>>>(d_M,rowsM,colsM,d_A,rowsA,colsA,d_result,blocksize, D[0]+1,size/(D[0]+1));
cudaThreadSynchronize();//block until kernel executes fully
//test for correct kernel execution
if ( cudaSuccess != (err=cudaGetLastError()) )
{
printf( "\nError in matrix multiplication kernel execution\n" );
printf( "\n-%s",(char*) cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
//Free the space on device
d_A=d_result;
cudaFree(d_M);
} //for loop end here
//BMatrix is now computed and need to be copied from device to host
//HERE IS AN ERROR-INVALID ARGUMENT
//copy matrix d_A from device to host
if (cudaSuccess!=(err=cudaMemcpy(h_result,d_A,(rowsA*colsA)*sizeof(double),cudaMemcpyDeviceToHost)))
{
printf("\n Error in cudaMemcpy for transferring final result from device to host");
printf( "\n-%s",(char*) cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaFree(d_result);
cudaFree(d_A);
return h_result;
}
Thanks in advance