hello,
i would write a code of multiplication of vector x matrix
i successed to obtain a good result for example multiplication of
vector[1][4] *matrix[4][3]
with declaration of Grid and Blocks like :
dim3 dimGrid2(1,1);
dim3 dimBlock2(3,1);
the result is true
but when i would increase number of elements of matrix like
vector[1][576] *matrix[576][10]
the result is false
this is my code :
////////////////////////////////////////////////////////////////////////////////////////////////////////
global_ void MatrixMulKernel(float *Md,float *Nd, float *Pd,int Hm,int Wm,int Hn,int Wn)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
float Pvaleur = 0;
for (int i = 0; i < Wm; ++i)
{
float MdElement = Md[ty * Hm + i];
float NdElement = Nd[i * Wn + tx];
Pvaleur += MdElement * NdElement;
}
Pd[ty * Wn + tx] = Pvaleur;
}
void MatrixMulOnDevice(int Hm,int Wm,int Hn ,int Wn)
{
int sizem = HmWm * sizeof(float);
int sizen = HnWn * sizeof(float);
int sizep = Hm*Wn * sizeof(float);
// allocate arrays on host
float* Na = (float*) malloc(sizen);
float* Pa = (float*) malloc(sizep);
// allocate array on device
cudaMalloc((void **) &Md, sizem);
cudaMemcpy(Md, test_faces_cu, sizem, cudaMemcpyHostToDevice) ;//matrix M
//test_faces_cu[1][576]
for(int i=0;i<SI*SI;i++)
for(int j=0;j<NBRE;j++)
Na[i*NBRE+j]=real_eigenvectors[i][j]; // real_eigenvectors[576][10];
for(int i=0;i<SI*SI;i++)
for(int j=0;j<NBRE;j++)
printf(“b[%d][%d]=%lf\n”,i,j,real_eigenvectors[i][j]);
cudaMalloc((void **) &Nd, sizen);
cudaMemcpy(Nd, Na, sizen, cudaMemcpyHostToDevice); //matrix N
cudaMalloc((void **) &Pd, sizep); //matrix P
dim3 dimGrid(1, 1);
dim3 dimBlock(Wn,Hm);
//Call of MatrixMulKernel
MatrixMulKernel<<<dimGrid, dimBlock>>>(Md,Nd,Pd,Hm,Wm,Hn,Wn);
// copy data from device to host
cudaMemcpy(Pa, Pd, sizep, cudaMemcpyDeviceToHost);
for(int i=0; i<Hm*Wn;i++)
printf(“s[%d]=%lf\n”,i,Pa[i]);
//free memory
free(Na);
free(Pa);
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
}
//normalisation avec cuda*********/
void NormVect(int n) {
// allocate array on device
cudaMalloc((void **) &adev, n);
cudaMemcpy(adev, test_faces_cu,n*sizeof(float), cudaMemcpyHostToDevice) ;
cudaMalloc((void **) &bdev, n);
cudaMemcpy(bdev, mean_vect, n*sizeof(float), cudaMemcpyHostToDevice);
dim3 dimGrid2(1,1);
dim3 dimBlock2(n,1); //n=576
incrementArrayOnDevice<<<dimGrid2, dimBlock2>>>(adev,bdev,n);
cudaMemcpy( test_faces_cu,adev, n*sizeof(float), cudaMemcpyDeviceToHost) ;
cudaFree(adev);
cudaFree(bdev);
}
i declare
dim3 dimGrid2(1,1);
dim3 dimBlock2(10,1);
i have the vector result only zero
so do you think that i execced the size of threads ??
so if somebody have any suggestion please