Hello,
I am trying to multiply two matrices stored as double-pointer (`int **`) arrays, which are declared and allocated in pinned (mapped) host memory as follows:
// A_d is a table of N row pointers; every row is a separate array of N ints.
int **A_d;
// Allocate the row-pointer table in page-locked, mapped host memory.
// NOTE(review): the cudaError_t results of these calls are not checked here;
// a failed allocation would leave A_d or A_d[i] unusable — verify them.
cudaHostAlloc((void **)&A_d, N * sizeof(int *), cudaHostAllocMapped);
for (i = 0; i < N; i++) {
    // Each row is its own pinned, mapped allocation of N ints.
    // NOTE(review): the pointers stored in A_d[i] are the host addresses
    // returned by cudaHostAlloc. They are only valid inside a kernel when the
    // platform uses unified virtual addressing; otherwise the device address
    // of the table and of every row must be obtained with
    // cudaHostGetDevicePointer — confirm for the target system.
    cudaHostAlloc((void **)&A_d[i], N * sizeof(int), cudaHostAllocMapped);
    // Copy row i of the source matrix A into the pinned row.
    for (j = 0; j < N; j++) {
        A_d[i][j] = A[i][j];
    }
}
The GPU code is:
/**
 * Naive integer matrix multiplication: C_d = A_d * B_d for N x N matrices.
 *
 * Each matrix is passed as a table of N row pointers, every row holding N
 * ints. The tables and all rows must be dereferenceable from device code.
 *
 * Launch layout: 2D grid of 2D blocks. The x index selects the row i and the
 * y index selects the column j, so the launch must provide at least N threads
 * in both x and y; surplus threads are filtered out by the bounds check.
 * Each in-range thread computes exactly one element of C_d. No shared memory
 * is used.
 *
 * NOTE(review): the launch configuration is not visible here. A launch with
 * an invalid configuration (e.g. more threads per block than the device
 * allows) fails without running the kernel, so check cudaGetLastError() after
 * the launch and synchronize before reading C_d on the host.
 *
 * @param A_d  left operand, N row pointers of N ints each
 * @param B_d  right operand, N row pointers of N ints each
 * @param C_d  output matrix, N row pointers of N ints each
 * @param N    number of rows and columns of every matrix
 */
__global__ void kernel(int **A_d, int **B_d, int **C_d, int N)
{
    // Global 2D thread coordinates: i is the row, j is the column.
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    // Guard against threads that fall outside the N x N matrix.
    if ((i < N) && (j < N)) {
        // Dot product of row i of A_d with column j of B_d.
        int sum = 0;
        for (int k = 0; k < N; k++) {
            sum = sum + A_d[i][k] * B_d[k][j];
        }
        C_d[i][j] = sum;
    }
}
When I use small arrays (N=3, N=256), I get correct results. But when I use a bigger size such as N=1024 (Nrows=1024, Ncols=1024), the result is not correct.
Any ideas?
Thank you in advance!