CUDA and Fortran: NaN returned from kernel

Hardware

GeForce GTX 260

Software

Intel Fortran 11.066

Windows XP 32x Pro

Visual Studio 2008 Russian Team

[codebox]

FLUIDS.FOR

VV(NIJ,NIJ),UU(NIJ,NIJ),VVZ(NIJ,NIJ),UUZ(NIJ,NIJ)

VX(NIJ,NIJ),VR(NIJ,NIJ),XKSI(NIJ,NIJ),YKSI(NIJ,NIJ)

  XETA(NIJ,NIJ),YETA(NIJ,NIJ)

SI(NIJ,NIJ),ET(NIJ,NIJ)

……………………

!All arrays have size (NIJ,NIJ)

!These arrays already contain data

……………………………………………………

………………………………

……………………………………………………

……………………………….

SUBROUTINE ForceTenzor

      INTERFACE

  SUBROUTINE FORCETENZOR_CU(NJ,NI,NJ1,NI1,NIJ,VX,XKSI,VR,YKSI,XETA,

 1  YETA,Y,SI,ET,YAC,UU,VV,VISC,ALFA,UUZ,VVZ,VRDY,TXX,

 1  TXY,TYY,EPVR)

      ! Specify C calling and naming conventions

      !DEC$ ATTRIBUTES C :: FORCETENZOR_CU

INTEGER, INTENT(IN) :: NJ,NI,NJ1,NI1,NIJ

      real*4 VX(1,1), XKSI(1,1), VR(1,1), YKSI(1,1), XETA(1,1), 

 1    YETA(1,1), Y(1,1), SI(1,1), ET(1,1),YAC(1,1), UU(1,1),

 1    VV(1,1), VISC(1,1), ALFA, UUZ(1,1),VVZ(1,1),VRDY(1,1),

 1    TXX(1,1),TXY(1,1),TYY(1,1),EPVR(1,1)

      character*50 argv

      END SUBROUTINE FORCETENZOR_CU  

END INTERFACE

……………………………………………………

…………………………

……………………………………………………

………………………….

  CALL FORCETENZOR_CU(NJ,NI,NJ1,NI1,NIJ,VX,XKSI,VR,YKSI,XETA,YETA,

 1                      Y,SI,ET,YAC,UU,VV,VISC,ALFA,UUZ,VVZ,VRDY,

1 TXX,TXY,TYY,EPVR)

……………………………………………………

……………………………

  RETURN

  END

FuncVx2.cu

extern "C" void forcetenzor_cu(int NJ,int NI,int NJ1,int NI1,int NIJ,float *VX,float *XKSI,float *VR, float *YKSI,float

*XETA,float *YETA,float *Y, float *SI,float *ET,float *YAC,float *UU, float *VV,float *VISC, float ALFA,float *UUZ,float

*VVZ,float *VRDY, float *TXX, float *TXY,float *TYY,float *EPVR)

{

……………………………………………………

……………………………

……………………………………………………

……………………………

//NJ=701,NJ1=700,NI=21,NI1=20,NIJ=741

for (int i=1;i<NIJ;i++)

{

for (int j=1;j<NIJ;j++)

{

  int smesh=(i-1)*NIJ+(j-1);

TXX[smesh]=0;

TXY[smesh]=0;

TYY[smesh]=0;

EPVR[smesh]=0;

}

}

int numBytes_module2 = NIJ*NIJ*sizeof ( float );

cudaEvent_t start_module2, stop_module2;

float gpuTime_module2 = 0.0f;

cudaEventCreate ( &start_module2 );

cudaEventCreate ( &stop_module2 );

cudaEventRecord ( start_module2, 0 );

//////////////////////////////////

float *SI_CU;

  cudaMalloc ( (void**)&SI_CU, numBytes_module2 );

float *ET_CU;

  cudaMalloc ( (void**)&ET_CU, numBytes_module2 );

float *YAC_CU;

cudaMalloc ( (void**)&YAC_CU, numBytes_module2 );

float *VISC_CU;

cudaMalloc ( (void**)&VISC_CU, numBytes_module2 );

float *VRDY_CU;

cudaMalloc ( (void**)&VRDY_CU, numBytes_module2 );

float *Y_CU;

cudaMalloc ( (void**)&Y_CU, numBytes_module2 );

float *TXX_CU;

cudaMalloc ( (void**)&TXX_CU, numBytes_module2 );

float *TXY_CU;

cudaMalloc ( (void**)&TXY_CU, numBytes_module2 );

float *TYY_CU;

cudaMalloc ( (void**)&TYY_CU, numBytes_module2 );

float *EPVR_CU;

cudaMalloc ( (void**)&EPVR_CU, numBytes_module2 );

//////////////////////////////////

cudaMemcpy(SI_CU, SI, numBytes_module2, cudaMemcpyHostToDevice);

cudaMemcpy(ET_CU, ET, numBytes_module2, cudaMemcpyHostToDevice);

cudaMemcpy(YAC_CU, YAC, numBytes_module2, cudaMemcpyHostToDevice);

cudaMemcpy(VISC_CU, VISC, numBytes_module2, cudaMemcpyHostToDevice);

cudaMemcpy(VRDY_CU, VRDY, numBytes_module2, cudaMemcpyHostToDevice);

cudaMemcpy(Y_CU, Y, numBytes_module2, cudaMemcpyHostToDevice);

cudaMemcpy(TXX_CU, TXX, numBytes_module2, cudaMemcpyHostToDevice);

cudaMemcpy(TXY_CU, TXY, numBytes_module2, cudaMemcpyHostToDevice);

cudaMemcpy(TYY_CU, TYY, numBytes_module2, cudaMemcpyHostToDevice);

cudaMemcpy(EPVR_CU, EPVR, numBytes_module2, cudaMemcpyHostToDevice);

//////////////////////////////////

dim3 threads_module2(256, 1);

dim3 blocks_module2(NIJ*NIJ / threads_module2.x, 1);

Kernel_module2<<<blocks_module2, threads_module2>>>(NJ1, NI1, NIJ, ALFA, SI_CU, ET_CU, YAC_CU,VISC_CU,

VRDY_CU, TXX_CU,TXY_CU,TYY_CU,EPVR_CU, XKSI_CU,VX_CU,XETA_CU,UU_CU,VV_CU, VR_CU, Y_CU, UUZ_CU,VVZ_CU,YKSI_CU,YETA_CU);

cudaMemcpy(TXX, TXX_CU, numBytes_module2, cudaMemcpyDeviceToHost);

cudaMemcpy(TXY, TXY_CU, numBytes_module2, cudaMemcpyDeviceToHost);

cudaMemcpy(TYY, TYY_CU, numBytes_module2, cudaMemcpyDeviceToHost);

cudaMemcpy(EPVR, EPVR_CU, numBytes_module2, cudaMemcpyDeviceToHost);

cudaEventRecord ( stop_module2, 0 );

cudaEventSynchronize ( stop_module2 );

cudaEventElapsedTime ( &gpuTime_module2, start_module2, stop_module2 );

                        // print the cpu and gpu times

printf("time spent executing by the GPU module 2+3+4: %.10f milliseconds\n", gpuTime_module2 );

……………………………………………………

…………

……………………………………………………

………….

}

__global__ void Kernel_module2 (int NJ1,int NI1,int NIJ, float ALFA, float *SI_CU,float *ET_CU,float *YAC_CU,

float *VISC_CU,float *VRDY_CU,

float *TXX_CU,float *TXY_CU,float *TYY_CU, float *EPVR_CU,

float *XKSI_CU,float *VX_CU,float *XETA_CU, float *UU_CU,

float *VV_CU, float *VR_CU, float *Y_CU,float *UUZ_CU,

float *VVZ_CU, float *YKSI_CU, float *YETA_CU)

{

int idx_module2_i_j = blockIdx.x * blockDim.x + threadIdx.x;

int idx_module2_i_jMIN1 = blockIdx.x * blockDim.x + (threadIdx.x-1);

int idx_module2_i_jPLUS1 = blockIdx.x * blockDim.x + (threadIdx.x+1);

int idx_module2_iMIN1_j = blockIdx.x * blockDim.x + threadIdx.x - NIJ;

int idx_module2_iPLUS1_j = blockIdx.x * blockDim.x + threadIdx.x + NIJ;

__shared__ float Q;

__shared__ float P;

__shared__ float TXX1;

__shared__ float TXX2;

__shared__ float TXX3;

__shared__ float TYY1;

__shared__ float TYY2;

__shared__ float TYY3;

Q=0;

P=0;

TXX1=0;

TXX2=0;

TXX3=0;

TYY1=0;

TYY2=0;

TYY3=0;

    Q=SI_CU[idx_module2_i_jPLUS1]-SI_CU[idx_module2_i_jMIN1];

    P=ET_CU[idx_module2_iPLUS1_j]-ET_CU[idx_module2_iMIN1_j];

__syncthreads();

TXX1= 2* ((XKSI_CU[idx_module2_i_jPLUS1]*VX_CU[idx_module2_i_jPLUS1]/YAC_CU[idx_module2_i_jPLUS1]- XKSI_CU

[idx_module2_i_jMIN1]* VX_CU[idx_module2_i_jMIN1]/YAC_CU[idx_module2_i_jMIN1])/Q+

(XETA_CU[idx_module2_iPLUS1_j]*VX_CU[idx_module2_iPLUS1_j]/YAC_CU[idx_module2_iPLUS1_j]-XETA_CU

[idx_module2_iMIN1_j]* VX_CU[idx_module2_iMIN1_j]/YAC_CU[idx_module2_iMIN1_j])/P);

TXX2=2.0f/3.0f*((UU_CU[idx_module2_i_jPLUS1]/YAC_CU[idx_module2_i_jPLUS1]-

UU_CU[idx_module2_i_jMIN1]/YAC_CU[idx_module2_i_jMIN1])/Q+

(VV_CU[idx_module2_iPLUS1_j]/YAC_CU[idx_module2_iPLUS1_j]-

VV_CU[idx_module2_iMIN1_j]/YAC_CU[idx_module2_iMIN1_j])/P);

TXX3=2.0f/3.0f*VR_CU[idx_module2_i_j]/(YAC_CU[idx_module2_i_j]*Y_CU[idx_module2_i_j]);

	__syncthreads();

TXX_CU[idx_module2_i_j]=YAC_CU[idx_module2_i_j]*VISC_CU[idx_module2_i_j]*(TXX1-TXX2-ALFA*TXX3);

//Original code in fortran

/* DO 1 J=2,NIJ

  DO 1 I=2,NIJ

    Q=SI(I,J+1)-SI(I,J-1)

    P=ET(I+1,J)-ET(I-1,J)

C===>

  TXX1=2.*((XKSI(I,J+1)*VX(I,J+1)/YAC(I,J+1)-

 -          XKSI(I,J-1)*VX(I,J-1)/YAC(I,J-1))/Q+

 +         (XETA(I+1,J)*VX(I+1,J)/YAC(I+1,J)-

 -          XETA(I-1,J)*VX(I-1,J)/YAC(I-1,J))/P)

C=>

  TXX2=2./3.*((UU(I,J+1)/YAC(I,J+1)-UU(I,J-1)/YAC(I,J-1))/Q+

 +            (VV(I+1,J)/YAC(I+1,J)-VV(I-1,J)/YAC(I-1,J))/P)

C=>

  TXX3=2./3.*VR(I,J)/(YAC(I,J)*Y(I,J))

C=>

  TXX(I,J)=YAC(I,J)*VISC(I,J)*(TXX1-TXX2-ALFA*TXX3)

  ENDDO

  ENDDO  

*/

//And other hard calculating

……………………………………………………

………………………………

[/codebox]

Problem:

The kernel Kernel_module2 is expected to fill the arrays TXX, TXY, TYY, and EPVR.

But after the call returns, these arrays contain NaN values.