Hi, I want to compute a matrix-vector multiplication inside a for loop, changing the vector data on every iteration and writing the output to a different memory location each time. My code is given below:
/**
 * Dense matrix-vector product: device_ResVect = device_Mat * device_Vect.
 *
 * One thread computes one output element (one matrix row). device_Mat is
 * read row-major: row r occupies elements [r*vlength, (r+1)*vlength).
 *
 * Launch layout: a 2D grid of 2D blocks. Threads are flattened as
 *     row = globalX + (gridDim.x * blockDim.x) * globalY
 * so the launch must provide at least matRowSize threads in total; surplus
 * threads return without touching memory. No shared memory is used.
 *
 * @param device_Mat     matrix, matRowSize * vlength floats
 * @param device_Vect    input vector, vlength floats
 * @param matRowSize     number of matrix rows (= number of outputs)
 * @param vlength        number of matrix columns (= input vector length)
 * @param device_ResVect output vector, matRowSize floats, fully overwritten
 */
__global__ void MatVectMultiplication(float *device_Mat, float *device_Vect,int matRowSize, int vlength, float *device_ResVect)
{
    const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
    const int tidy = blockIdx.y * blockDim.y + threadIdx.y;

    // Row pitch of the thread grid comes from the actual launch configuration
    // (blockDim.x) rather than a compile-time macro, so the flattening stays
    // correct for any block shape.
    const int tindex = tidx + gridDim.x * blockDim.x * tidy;

    if (tindex >= matRowSize)
    {
        return;
    }

    // Widen before multiplying so large matrices do not overflow int.
    const size_t rowStart = (size_t)tindex * (size_t)vlength;

    // Accumulate in a register and store once, instead of a global-memory
    // read-modify-write per term.
    float sum = 0.0f;
    for (int i = 0; i < vlength; i++)
    {
        sum += device_Mat[rowStart + i] * device_Vect[i];
    }
    device_ResVect[tindex] = sum;

    // No block barrier is needed: threads never exchange data with each other.
}//end of MatVect device function
Kernel call:
// One 16x16 block = 256 threads, i.e. one thread per matrix row (256 rows).
dim3 blockSize(16, 16);
dim3 gridSize(1, 1);
// Compute at most three products: exactly 256 * 3 results are copied back
// below, so a fourth product (i_l == 3) would land outside the copied range.
// NOTE(review): iste is defined elsewhere. If it is not 1, the output slots
// for the skipped values of i_l are never written and keep whatever d_C held
// before (e.g. zeros) -- confirm the intended step.
for (int i_l = 0; i_l < 3; i_l += iste)
{
    // NOTE(review): &d_A[i_l] moves the input window by ONE element per
    // iteration. If the input vectors are stored back to back, this should
    // presumably be &d_A[i_l * 256] -- confirm against the layout of d_A.
    MatVectMultiplication<<<gridSize, blockSize>>>(d_B, &d_A[i_l], 256, 256, &d_C[i_l * 256]);
    // A launch does not return an error itself; configuration errors only
    // show up through cudaGetLastError().
    cudaErrCheck(cudaGetLastError());
    // Wait for the kernel so execution errors are reported for this iteration.
    cudaErrCheck(cudaDeviceSynchronize());
}
cudaErrCheck(cudaMemcpy(C_lp, d_C, 256 * 3 * sizeof(float), cudaMemcpyDeviceToHost));
Only the first 256 values match the results of the CPU function; all the remaining GPU outputs come back as zeros.
Can anyone give some clues?