Hello,
I have written a simple code to do floating point matrix multiplication on CPU and GPU and compare the results. The problem is copying matrix into the device. Here is the code:
#include <math.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
int main()
{
int Dim=5;
int test=0;
int i=0,j=0,k=0,c=0,d=0;
float Matrix1[Dim][Dim], Matrix2[Dim][Dim],Result_ACC[Dim][Dim],Result_OMP[Dim][Dim],Diff=0,sum=0;
double tstart, tstop;
for (i=0;i<Dim;i++){
for(j=0;j<Dim;j++){
Result_ACC[i][j]=0;
Result_OMP[i][j]=0;
Matrix1[i][j]=((float)(rand()%50)/15);
Matrix2[i][j]=((float)(rand()%70)/17);
}
}
for (i=0; i<Dim; i++){
for (c = 0; c < Dim; c++) {
printf("Result_ACC[%d][%d]= %G \n", i,c, Result_ACC[i][c]);
}
}
printf ("Going to GPU.test is %d \n\n",test);
struct timeval mytime1;
gettimeofday(&mytime1,(struct timezone*)0);
tstart = (double)(mytime1.tv_sec + mytime1.tv_usec*1.0e-6);
#pragma acc kernels pcopyout(Result_ACC[Dim][Dim],test) copyin(Dim,sum,Matrix1[Dim][Dim], Matrix2[Dim][Dim],c,d,k)
for (c = 0; c < Dim; c++) {
for (d = 0; d < Dim; d++) {
for (k = 0; k < Dim; k++) {
sum = sum + Matrix1[c][k]*Matrix2[k][d];
test=10;
}
Result_ACC[c][d] = sum;
sum = 0;
}
}
printf ("Back from GPU.test is %d \n\n",test);
for (i=0; i<Dim; i++){
for (c = 0; c < Dim; c++) {
printf("Result_ACC[%d][%d]= %G \n", i,c, Result_ACC[i][c]);
}
}
struct timeval mytime2;
gettimeofday(&mytime2,(struct timezone*)0);
tstop = (double)(mytime2.tv_sec + mytime2.tv_usec*1.0e-6);
printf ("Timer Stoped.Dim is %d Sum is %G \n", Dim, sum);
sum=0;
for (c = 0; c < Dim; c++) {
for (d = 0; d < Dim; d++) {
for (k = 0; k < Dim; k++) {
sum = sum + Matrix1[c][k]*Matrix2[k][d];
}
Result_OMP[c][d] = sum;
sum = 0;
}
}
for (i=0; i<Dim; i++){
for (c =0; c < Dim; c++) {
printf("Result_OMP[%d][%d]= %G \n",i,c, Result_OMP[i][c]);
}
}
for (i=0; i<Dim; i++){
for (c = 0; c < Dim; c++) {
Diff+= fabs(Result_OMP[i][c]-Result_ACC[i][c]);
}
}
printf ("The average difference is %0.15f and it took %G seconds.\n",Diff/(Dim*Dim), tstop-tstart);
return 0;
}
matrix dimension is 55 and I am expecting to have 55*4 bytes copied into device. But when I run the code I get this report on the terminal:
Result_ACC[0][0]= 0
Result_ACC[0][1]= 0
Result_ACC[0][2]= 0
Result_ACC[0][3]= 0
Result_ACC[0][4]= 0
Result_ACC[1][0]= 0
Result_ACC[1][1]= 0
Result_ACC[1][2]= 0
Result_ACC[1][3]= 0
Result_ACC[1][4]= 0
Result_ACC[2][0]= 0
Result_ACC[2][1]= 0
Result_ACC[2][2]= 0
Result_ACC[2][3]= 0
Result_ACC[2][4]= 0
Result_ACC[3][0]= 0
Result_ACC[3][1]= 0
Result_ACC[3][2]= 0
Result_ACC[3][3]= 0
Result_ACC[3][4]= 0
Result_ACC[4][0]= 0
Result_ACC[4][1]= 0
Result_ACC[4][2]= 0
Result_ACC[4][3]= 0
Result_ACC[4][4]= 0
Going to GPU.test is 0
upload CUDA data file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=Dim bytes=4
upload CUDA data file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=sum bytes=4
upload CUDA data file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=Matrix1 bytes=20
upload CUDA data file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=Matrix2 bytes=20
upload CUDA data file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=c bytes=4
upload CUDA data file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=d bytes=4
upload CUDA data file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=k bytes=4
launch CUDA kernel file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=40 device=0 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1
download CUDA data file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=48 device=0 variable=test bytes=4
download CUDA data file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=48 device=0 variable=Result_ACC bytes=20
Back from GPU.test is 2139095040
Result_ACC[0][0]= 5.95294
Result_ACC[0][1]= 0.647059
Result_ACC[0][2]= 7.75686
Result_ACC[0][3]= 0.843137
Result_ACC[0][4]= 4.01176
Result_ACC[1][0]= 0
Result_ACC[1][1]= 0
Result_ACC[1][2]= 0
Result_ACC[1][3]= 0
Result_ACC[1][4]= 0
Result_ACC[2][0]= 0
Result_ACC[2][1]= 0
Result_ACC[2][2]= 0
Result_ACC[2][3]= 0
Result_ACC[2][4]= 0
Result_ACC[3][0]= 0
Result_ACC[3][1]= 0
Result_ACC[3][2]= 0
Result_ACC[3][3]= 0
Result_ACC[3][4]= 0
Result_ACC[4][0]= 0
Result_ACC[4][1]= 0
Result_ACC[4][2]= 0
Result_ACC[4][3]= 0
Result_ACC[4][4]= 0
Timer Stoped.Dim is 5 Sum is 0
Result_OMP[0][0]= 31.7647
Result_OMP[0][1]= 25.3098
Result_OMP[0][2]= 26.9882
Result_OMP[0][3]= 22.6235
Result_OMP[0][4]= 16.4392
Result_OMP[1][0]= 22.1255
Result_OMP[1][1]= 21.7059
Result_OMP[1][2]= 20.2118
Result_OMP[1][3]= 18.9843
Result_OMP[1][4]= 8.85098
Result_OMP[2][0]= 14.1412
Result_OMP[2][1]= 10.4118
Result_OMP[2][2]= 13.5686
Result_OMP[2][3]= 9.68628
Result_OMP[2][4]= 6.45882
Result_OMP[3][0]= 20.7412
Result_OMP[3][1]= 18.1843
Result_OMP[3][2]= 19.902
Result_OMP[3][3]= 15.6431
Result_OMP[3][4]= 7.77255
Result_OMP[4][0]= 23.8588
Result_OMP[4][1]= 20.6627
Result_OMP[4][2]= 23.0275
Result_OMP[4][3]= 18.0549
Result_OMP[4][4]= 8.82745
Diff is 25.8118
Diff is 24.6627
Diff is 19.2314
Diff is 21.7804
Diff is 12.4275
Diff is 22.1255
Diff is 21.7059
Diff is 20.2118
Diff is 18.9843
Diff is 8.85098
Diff is 14.1412
Diff is 10.4118
Diff is 13.5686
Diff is 9.68628
Diff is 6.45882
Diff is 20.7412
Diff is 18.1843
Diff is 19.902
Diff is 15.6431
Diff is 7.77255
Diff is 23.8588
Diff is 20.6627
Diff is 23.0275
Diff is 18.0549
Diff is 8.82745
The average difference is 17.069334030151367 and it took 0.254893 seconds.
It only copies 20 bytes to the device. I don’t know what is wrong here. Even the test variable value that is copied back from device is not correct. I assign 10 to test in the kernel, but I read a weird value in CPU. I guess it has to do with the 2D arrays in the code.
Thanks,
Ali