I am a beginner in CUDA. I want to speed up my finite element program in Fortran by CUDA, which has been implemented before based on MPI.
But I encountered a basic problem in my first program, which multiplies two matrix simply.
I can compile and run the code correctly on device emulation mode, but when I turn to true GPU mode (Geforece 8800 GT), it failed.
I dont know the exact reasn.(may be double precision problem?)
Can anyone help me ? Thank you in advance.
The following is my source consiting two files, one is main.for , the other is MatrixMul.cu
( without considering efficiency yet, just for tesing the rightness)
Platform: Windows XP32bit Geforece 8800 GT, Intel Fortran compiler 10.1.019 Visual studio 2005 CUDA 2.1(beta)
Compile & link command
nvcc -c MatrixMul.cu -o MatrixMul.obj
ifort main.for MatrixMul.cu MatrixMul.obj c:\cuda\lib\cuda.lib c:\cuda\lib\cudart.lib
The FORTRAN part
c--------------------------------------------
c MatrixMul.for
c---------------------------------------------
PROGRAM MATRIXMUL
IMPLICIT NONE
INTEGER,PARAMETER::DIMA=4
INTEGER,PARAMETER::DIMB=5
INTEGER,PARAMETER::DIMC=6
DOUBLE PRECISION M(DIMA,DIMB),N(DIMB,DIMC),P(DIMA,DIMC),Q(DIMA,DIMC)
INTEGER I,J,K
DOUBLE PRECISION ERROR_MAX,ERROR_AVG
DO I=1,DIMB
DO J=1,DIMA
CALL RANDOM_NUMBER(M(J,I))
END DO
END DO
DO I=1,DIMC
DO J=1,DIMB
CALL RANDOM_NUMBER(N(J,I))
END DO
END DO
PRINT*,'M=',DIMA,'X',DIMB
DO I=1,DIMA
DO J=1,DIMB
WRITE(*,ADVANCE="NO",FMT=10)M(I,J)
END DO
WRITE(*,*)
END DO
PRINT*,'N=',DIMB,'X',DIMC
DO I=1,DIMB
DO J=1,DIMC
WRITE(*,ADVANCE="NO",FMT=10)N(I,J)
END DO
WRITE(*,*)
END DO
PRINT*,'EXECUTING ON HOST:'
CALL MatrixMulOnHost(M,N,P,DIMA,DIMB,DIMC)
PRINT*,'P=',DIMA,'X',DIMC
DO I=1,DIMA
DO J=1,DIMC
WRITE(*,ADVANCE="NO",FMT=10)P(I,J)
END DO
WRITE(*,*)
END DO
PRINT*,'EXECUTING ON Device:'
CALL MatrixMulOnDevice(M,N,Q,DIMA,DIMB,DIMC)
PRINT*,'Q=',DIMA,'X',DIMC
DO I=1,DIMA
DO J=1,DIMC
WRITE(*,ADVANCE="NO",FMT=10)Q(I,J)
END DO
WRITE(*,*)
END DO
C evaluate the errorness
CALL ERROR_EVAL(P,Q,DIMA,DIMC,ERROR_MAX,ERROR_AVG)
PRINT*,'ERROR MAX:',ERROR_MAX
PRINT*,'ERROR AVG:',ERROR_AVG
STOP
10 FORMAT(F10.2,1X)
END
SUBROUTINE MatrixMulOnHost(M,N,P,DIMA,DIMB,DIMC)
IMPLICIT NONE
INTEGER DIMA,DIMB,DIMC
DOUBLE PRECISION M(DIMA,DIMB)
DOUBLE PRECISION N(DIMB,DIMC)
DOUBLE PRECISION P(DIMA,DIMC)
INTEGER I,J,K
DOUBLE PRECISION A,B,C
DO I=1,DIMA
DO J=1,DIMC
C=0.0D0
DO K=1,DIMB
A=M(I,K)
B=N(K,J)
C=C+A*B
END DO
P(I,J)=C
END DO
END DO
RETURN
END
SUBROUTINE ERROR_EVAL(P,Q,DIMA,DIMC,ERROR_MAX,ERROR_AVG)
IMPLICIT NONE
INTEGER DIMA,DIMC
DOUBLE PRECISION P(DIMA,DIMC),Q(DIMA,DIMC)
DOUBLE PRECISION ERROR_MAX,ERROR_AVG
INTEGER I,J
DOUBLE PRECISION PE,QE
DOUBLE PRECISION ERROR
ERROR_MAX=0.0D0
ERROR_AVG=0.0D0
DO I=1,DIMA
DO J=1,DIMC
PE=P(I,J)
QE=Q(I,J)
ERROR=DABS(PE-QE)
IF(ERROR.GT.ERROR_MAX) ERROR_MAX=ERROR
ERROR_AVG=ERROR_AVG+ERROR
END DO
END DO
ERROR_AVG=ERROR_AVG/DIMA/DIMC
RETURN
END
//-------------------------------------------------------
// MatrixMul.cu
//
// the C interface and GPU kenrel part
//
#include <stdio.h>
global void MatrixMulKernel(const double *, const double *, double ,int ,int ,int, clock_t);
extern “C” void MATRIXMULONDEVICE(double* M,double* N,double* P,int* dimA,int* dimB,int* dimC)
{
double*Md,*Nd,*Pd;
int sizeM,sizeN,sizeP;
int DIMA,DIMB,DIMC;
DIMA=*dimA;
DIMB=*dimB;
DIMC=*dimC;
//calculate the size of memory
sizeM=DIMA*DIMB*sizeof(double);
sizeN=DIMB*DIMC*sizeof(double);
sizeP=DIMA*DIMC*sizeof(double);
//allocate the memory on device
cudaMalloc((void**)&Md,sizeM);
cudaMalloc((void**)&Nd,sizeN);
cudaMalloc((void**)&Pd,sizeP);
//send data from host to device
cudaMemcpy(Md,M,sizeM,cudaMemcpyHostToDevice);
cudaMemcpy(Nd,N,sizeN,cudaMemcpyHostToDevice);
//configure compute enviroment
dim3 dimGrid(1);
dim3 dimBlock(1);
//execute compute kernel
MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,DIMA,DIMB,DIMC);
//receive data from device to host
cudaMemcpy(P,Pd,sizeP,cudaMemcpyDeviceToHost);
//release device memory
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
}
global void MatrixMulKernel(const double *Md, const double *Nd, double *Pd,int DIMA,int DIMB,int DIMC)
{
const int tid = threadIdx.x;
double M,N,P;
for(int i=0;i<DIMA;i++)
for(int j=0;j<DIMC;j++)
{
P=0.0;
for(int k=0;k<DIMB;k++)
{
M=Md[k*DIMA+i];
N=Nd[j*DIMB+k];
P+=M*N;
}
Pd[j*DIMA+i]=P;
}
}