Hardware
GeForce GTX 260
Software
Intel Fortran 11.066
Windows XP 32x Pro
Visual Studio 2008 Russian Team
[codebox]
FLUIDS.FOR
VV(NIJ,NIJ),UU(NIJ,NIJ),VVZ(NIJ,NIJ),UUZ(NIJ,NIJ)
VX(NIJ,NIJ),VR(NIJ,NIJ) XKSI(NIJ,NIJ),YKSI(NIJ,NIJ)
XETA(NIJ,NIJ),YETA(NIJ,NIJ)
SI(NIJ,NIJ),ET(NIJ,NIJ)
……………………
!All massive have size(NIJ,NIJ)
!In these massive some data contain
……………………………………………………
………………………………
……………………………………………………
……………………………….
SUBROUTINE ForceTenzor
INTERFACE
SUBROUTINE FORCETENZOR_CU(NJ,NI,NJ1,NI1,NIJ,VX,XKSI,VR,YKSI,XETA,
1 YETA,Y,SI,ET,YAC,UU,VV,VISC,ALFA,UUZ,VVZ,VRDY,TXX,
1 TXY,TYY,EPVR)
! Specify C calling and naming conventions
!DEC$ ATTRIBUTES C :: FORCETENZOR_CU
INTEGER, INTENT(IN) :: NJ,NI,NJ1,NI1,NIJ
real*4 VX(1,1), XKSI(1,1), VR(1,1), YKSI(1,1), XETA(1,1),
1 YETA(1,1), Y(1,1), SI(1,1), ET(1,1),YAC(1,1), UU(1,1),
1 VV(1,1), VISC(1,1), ALFA, UUZ(1,1),VVZ(1,1),VRDY(1,1),
1 TXX(1,1),TXY(1,1),TYY(1,1),EPVR(1,1)
character*50 argv
END SUBROUTINE FORCETENZOR_CU
END INTERFACE
……………………………………………………
…………………………
……………………………………………………
………………………….
CALL FORCETENZOR_CU(NJ,NI,NJ1,NI1,NIJ,VX,XKSI,VR,YKSI,XETA,YETA,
1 Y,SI,ET,YAC,UU,VV,VISC,ALFA,UUZ,VVZ,VRDY,
1 TXX,TXY,TYY,EPVR)
……………………………………………………
……………………………
RETURN
END
FuncVx2.cu
extern “C” void forcetenzor_cu(int NJ,int NI,int NJ1,int NI1,int NIJ,float *VX,float *XKSI,float *VR, float *YKSI,float
*XETA,float *YETA,float *Y, float *SI,float *ET,float *YAC,float *UU, float *VV,float *VISC, float ALFA,float *UUZ,float
*VVZ,float *VRDY, float *TXX, float *TXY,float *TYY,float *EPVR)
{
……………………………………………………
……………………………
……………………………………………………
……………………………
//NJ=701,NJ1=700,NI=21,NI1=20,NIJ=741
for (int i=1;i<NIJ;i++)
{
for (int j=1;j<NIJ;j++)
{
int smesh=(i-1)*NIJ+(j-1);
TXX[smesh]=0;
TXY[smesh]=0;
TYY[smesh]=0;
EPVR[smesh]=0;
}
}
int numBytes_module2 = NIJ*NIJ*sizeof ( float );
cudaEvent_t start_module2, stop_module2;
float gpuTime_module2 = 0.0f;
cudaEventCreate ( &start_module2 );
cudaEventCreate ( &stop_module2 );
cudaEventRecord ( start_module2, 0 );
//////////////////////////////////
float *SI_CU;
cudaMalloc ( (void**)&SI_CU, numBytes_module2 );
float *ET_CU;
cudaMalloc ( (void**)&ET_CU, numBytes_module2 );
float *YAC_CU;
cudaMalloc ( (void**)&YAC_CU, numBytes_module2 );
float *VISC_CU;
cudaMalloc ( (void**)&VISC_CU, numBytes_module2 );
float *VRDY_CU;
cudaMalloc ( (void**)&VRDY_CU, numBytes_module2 );
float *Y_CU;
cudaMalloc ( (void**)&Y_CU, numBytes_module2 );
float *TXX_CU;
cudaMalloc ( (void**)&TXX_CU, numBytes_module2 );
float *TXY_CU;
cudaMalloc ( (void**)&TXY_CU, numBytes_module2 );
float *TYY_CU;
cudaMalloc ( (void**)&TYY_CU, numBytes_module2 );
float *EPVR_CU;
cudaMalloc ( (void**)&EPVR_CU, numBytes_module2 );
//////////////////////////////////
cudaMemcpy(SI_CU, SI, numBytes_module2, cudaMemcpyHostToDevice);
cudaMemcpy(ET_CU, ET, numBytes_module2, cudaMemcpyHostToDevice);
cudaMemcpy(YAC_CU, YAC, numBytes_module2, cudaMemcpyHostToDevice);
cudaMemcpy(VISC_CU, VISC, numBytes_module2, cudaMemcpyHostToDevice);
cudaMemcpy(VRDY_CU, VRDY, numBytes_module2, cudaMemcpyHostToDevice);
cudaMemcpy(Y_CU, Y, numBytes_module2, cudaMemcpyHostToDevice);
cudaMemcpy(TXX_CU, TXX, numBytes_module2, cudaMemcpyHostToDevice);
cudaMemcpy(TXY_CU, TXY, numBytes_module2, cudaMemcpyHostToDevice);
cudaMemcpy(TYY_CU, TYY, numBytes_module2, cudaMemcpyHostToDevice);
cudaMemcpy(EPVR_CU, EPVR, numBytes_module2, cudaMemcpyHostToDevice);
//////////////////////////////////
dim3 threads_module2(256, 1);
dim3 blocks_module2(NIJ*NIJ / threads_module2.x, 1);
Kernel_module2<<<blocks_module2, threads_module2>>>(NJ1, NI1, NIJ, ALFA, SI_CU, ET_CU, YAC_CU,VISC_CU,
VRDY_CU, TXX_CU,TXY_CU,TYY_CU,EPVR_CU, XKSI_CU,VX_CU,XETA_CU,UU_CU,VV_CU, VR_CU, Y_CU, UUZ_CU,VVZ_CU,YKSI_CU,YETA_CU);
cudaMemcpy(TXX, TXX_CU, numBytes_module2, cudaMemcpyDeviceToHost);
cudaMemcpy(TXY, TXY_CU, numBytes_module2, cudaMemcpyDeviceToHost);
cudaMemcpy(TYY, TYY_CU, numBytes_module2, cudaMemcpyDeviceToHost);
cudaMemcpy(EPVR, EPVR_CU, numBytes_module2, cudaMemcpyDeviceToHost);
cudaEventRecord ( stop_module2, 0 );
cudaEventSynchronize ( stop_module2 );
cudaEventElapsedTime ( &gpuTime_module2, start_module2, stop_module2 );
// print the cpu and gpu times
printf("time spent executing by the GPU module 2+3+4: %.10f millseconds\n", gpuTime_module2 );
……………………………………………………
…………
……………………………………………………
………….
}
Kernel_module2 (int NJ1,int NI1,int NIJ, float ALFA, float *SI_CU,float *ET_CU,float *YAC_CU,
float *VISC_CU,float *VRDY_CU,
float *TXX_CU,float *TXY_CU,float *TYY_CU, float *EPVR_CU,
float *XKSI_CU,float *VX_CU,float *XETA_CU, float *UU_CU,
float *VV_CU, float *VR_CU, float *Y_CU,float *UUZ_CU,
float *VVZ_CU, float *YKSI_CU, float *YETA_CU)
{
int idx_module2_i_j = blockIdx.x * blockDim.x + threadIdx.x;
int idx_module2_i_jMIN1 = blockIdx.x * blockDim.x + (threadIdx.x-1);
int idx_module2_i_jPLUS1 = blockIdx.x * blockDim.x + (threadIdx.x+1);
int idx_module2_iMIN1_j = blockIdx.x * blockDim.x + threadIdx.x - NIJ;
int idx_module2_iPLUS1_j = blockIdx.x * blockDim.x + threadIdx.x + NIJ;
__shared__ float Q;
__shared__ float P;
__shared__ float TXX1;
__shared__ float TXX2;
__shared__ float TXX3;
__shared__ float TYY1;
__shared__ float TYY2;
__shared__ float TYY3;
Q=0;
P=0;
TXX1=0;
TXX2=0;
TXX3=0;
TYY1=0;
TYY2=0;
TYY3=0;
Q=SI_CU[idx_module2_i_jPLUS1]-SI_CU[idx_module2_i_jMIN1];
P=ET_CU[idx_module2_iPLUS1_j]-ET_CU[idx_module2_iMIN1_j];
__syncthreads();
TXX1= 2* ((XKSI_CU[idx_module2_i_jPLUS1]*VX_CU[idx_module2_i_jPLUS1]/YAC_CU[idx_module2_i_jPLUS1]- XKSI_CU
[idx_module2_i_jMIN1]* VX_CU[idx_module2_i_jMIN1]/YAC_CU[idx_module2_i_jMIN1])/Q+
(XETA_CU[idx_module2_iPLUS1_j]*VX_CU[idx_module2_iPLUS1_j]/YAC_CU[idx_module2_iPLUS1_j]-XETA_CU
[idx_module2_iMIN1_j]* VX_CU[idx_module2_iMIN1_j]/YAC_CU[idx_module2_iMIN1_j])/P);
TXX2=2/3*((UU_CU[idx_module2_i_jPLUS1]/YAC_CU[idx_module2_i_jPLUS1]-
UU_CU[idx_module2_i_jMIN1]/YAC_CU[idx_module2_i_jMIN1])/Q+
(VV_CU[idx_module2_iPLUS1_j]/YAC_CU[idx_module2_iPLUS1_j]-
VV_CU[idx_module2_iMIN1_j]/YAC_CU[idx_module2_iMIN1_j])/P);
TXX3=2/3*VR_CU[idx_module2_i_j]/(YAC_CU[idx_module2_i_j]*Y_CU[idx_module2_i_j]);
__syncthreads();
TXX_CU[idx_module2_i_j]=YAC_CU[idx_module2_i_j]*VISC_CU[idx_
module2_i_j](TXX1-TXX2-ALFATXX3);
//Original code in fortran
/* DO 1 J=2,NIJ
DO 1 I=2,NIJ
Q=SI(I,J+1)-SI(I,J-1)
P=ET(I+1,J)-ET(I-1,J)
C===>
TXX1=2.*((XKSI(I,J+1)*VX(I,J+1)/YAC(I,J+1)-
- XKSI(I,J-1)*VX(I,J-1)/YAC(I,J-1))/Q+
+ (XETA(I+1,J)*VX(I+1,J)/YAC(I+1,J)-
- XETA(I-1,J)*VX(I-1,J)/YAC(I-1,J))/P)
C=>
TXX2=2./3.*((UU(I,J+1)/YAC(I,J+1)-UU(I,J-1)/YAC(I,J-1))/Q+
+ (VV(I+1,J)/YAC(I+1,J)-VV(I-1,J)/YAC(I-1,J))/P)
C=>
TXX3=2./3.*VR(I,J)/(YAC(I,J)*Y(I,J))
C=>
TXX(I,J)=YAC(I,J)*VISC(I,J)*(TXX1-TXX2-ALFA*TXX3)
ENDDO
ENDDO
*/
//And other hard calculating
……………………………………………………
………………………………
[/codebox]
Problem:
Function Kernel_module2 return massive TXX,TXY, EPVR, TYY
But this Massive contain after return ----“NANâ€