Breaking one program into several kernels

Hello everyone.

What is the best way to parallelize this program?

Data

[codebox]int NIJ=271;

double EPSVN;

F(NIJ,NIJ)

A(NIJ,NIJ)

B(NIJ,NIJ)

UUU(NIJ,NIJ)

AT(NIJ,NIJ)

BT(NIJ,NIJ)

C(NIJ,NIJ)

WUU(NIJ,NIJ)

GANJ(NIJ)

DNJ(NIJ)

ALTI1(NIJ)

BETI1(NIJ)

GATIN(NIJ)

DTIN(NIJ)

AF(NIJ)

BF(NIJ)

YPR(NIJ)

All arrays are already in device memory.

[/codebox]

Code

[codebox] ITERZ=0

C     Iterative solver: repeats line-by-line sweeps over UUU until the
C     largest relative change EPSZ is no longer greater than EPSVN.
C     First, save the starting UUU into WUU (the "previous" values).
  DO J=1,NIJ

  DO I=1,NIJ

    WUU(I,J)=UUU(I,J)

END DO

END DO

C===>

C     OMEGA is the weight used in STEP2 to blend the new line solution
C     YPR with the previous values WUU.
  OMEGA=0.5

C     Label 2: start of one iteration (re-entered by the GOTO at the end).
2 ITERZ=ITERZ+1

C     Update row NI of UUU from row NI1.
C     NOTE(review): NI, NI1, NJ, NJ1 are not defined in the posted code;
C     presumably NI1=NI-1 and NJ1=NJ-1 -- confirm.
    DO J=1,NIJ

     UUU(NI,J)=UUU(NI1,J)*GANJ(J)+DNJ(J)

    END DO

C===> LOOP ON THE LINES “I”

C     I runs downward starting at NI-1.  Each pass reads UUU(I+1,J),
C     which the previous pass has just written, so the passes of this
C     loop depend on each other in order.
C     NOTE(review): if NI equals NIJ the last pass has I=1 and STEP1
C     reads UUU(I-1,J) = UUU(0,J) -- confirm the upper bound of II.
  DO 3 II=2,NIJ

  I=NI-II+1

C===> STEP1

C     Gather the coefficients and right-hand side for line I, J=2..NJ1.
  DO J=2,NJ1

    APR(J)=AT(I,J-1)

    BPR(J)=BT(I,J-1)

    CPR(J)=C(I,J-1)

    FPR(J)=F(I,J)+A(I,J)*UUU(I-1,J)+B(I,J)*UUU(I+1,J)

  END DO

C========================================>

C     Per-line values: A1,B1 seed AF(2),BF(2); A2,B2 are used below to
C     compute YPR(NJ).
  A1=ALTI1(I)

  B1=BETI1(I)

  A2=GATIN(I)

  B2=DTIN(I)

  AF(2)=A1

  BF(2)=B1

C

C     Forward recurrence: AF(L+1),BF(L+1) depend on AF(L),BF(L), so this
C     loop is sequential in L.
C     NOTE(review): the last pass reads BPR(NJ1+1) and APR(NJ1+1), which
C     STEP1 above does not write -- confirm the intended indices.
  DO L=2,NJ1

    Z=1./(CPR(L)-AF(L)*APR(L))

    AF(L+1)=BPR(L+1)*Z

    BF(L+1)=(APR(L+1)*BF(L)+FPR(L))*Z

  END DO

C

C     Value at the end of the line, J=NJ.
C     NOTE(review): BF is taken at NJ but AF at NJ-2 -- confirm that the
C     differing index is intended.
  YPR(NJ)=(B2+A2*BF(NJ))/(1.-A2*AF(NJ-2))

C

C     Backward recurrence: L runs from NJ-1 down to 1 and each YPR(L)
C     uses YPR(L+1), so this loop is sequential in L as well.
C     NOTE(review): at L=1 this reads AF(0) and BF(0) -- confirm the
C     array lower bounds or the intended index.
  DO LL=2,NJ

    L=NJ-LL+1

    YPR(L)=AF(L-1)*YPR(L+1)+BF(L-1)

  END DO

C===> PROGONKA: END

C========================================>

C===> STEP2

C     Store line I: weighted mix of the new YPR and the previous WUU.
  DO J=1,NIJ

    UUU(I,J)=YPR(J)*OMEGA+(1.-OMEGA)*WUU(I,J)

  END DO

3 CONTINUE

C==>

C     Update row 1 of UUU from row 2.
C     NOTE(review): AL1J and BE1J are not in the posted data list.
  DO J=1,NIJ

    UUU(1,J)=UUU(2,J)*AL1J(J)+BE1J(J)

  END DO

C===>

C     EPSZ = largest |UUU-WUU|/|UUU| over points with |UUU| > 1.0E-05;
C     WUU is refreshed to the current UUU in the same loop.
  EPSZ=0.

  DO J=1,NIJ

  DO I=1,NIJ

  WZ=ABS(UUU(I,J))

  IF(WZ.GT.1.0E-05) EPSZ=DMAX1(EPSZ,ABS((UUU(I,J)-WUU(I,J))/WZ))

  WUU(I,J)=UUU(I,J)

END DO

END DO

C===>

C     Not converged yet: run another iteration.
  IF(EPSZ.GT.EPSVN) GOTO 2

[/codebox]

I am thinking of breaking the program into 5 parts:

//1 Part///////////////////////

[codebox] DO J=1,NIJ

C     Part 1: copy every element of UUU into WUU.  Each (I,J) element
C     is written independently of the others.
  DO I=1,NIJ

    WUU(I,J)=UUU(I,J)

END DO

END DO

[/codebox]

//2 PART/////////////////////////

[codebox] DO J=1,NIJ

C     Part 2: set row NI of UUU from row NI1; each J is independent.
C     NOTE(review): NI and NI1 are not defined in the posted code.
     UUU(NI,J)=UUU(NI1,J)*GANJ(J)+DNJ(J)

    END DO

[/codebox]

//3 PART//////////////////////////////////

[codebox]C===> LOOP ON THE LINES “I”

C     Part 3: solve line by line with I running downward from NI-1.
C     Each pass reads UUU(I+1,J), which the previous pass has just
C     written, so the passes of this loop depend on each other in order.
C     NOTE(review): if NI equals NIJ the last pass has I=1 and STEP1
C     reads UUU(I-1,J) = UUU(0,J) -- confirm the upper bound of II.
  DO 3 II=2,NIJ

  I=NI-II+1

C===> STEP1

C     Gather the coefficients and right-hand side for line I, J=2..NJ1.
  DO J=2,NJ1

    APR(J)=AT(I,J-1)

    BPR(J)=BT(I,J-1)

    CPR(J)=C(I,J-1)

    FPR(J)=F(I,J)+A(I,J)*UUU(I-1,J)+B(I,J)*UUU(I+1,J)

  END DO

C========================================>

C     Per-line values: A1,B1 seed AF(2),BF(2); A2,B2 are used below to
C     compute YPR(NJ).
  A1=ALTI1(I)

  B1=BETI1(I)

  A2=GATIN(I)

  B2=DTIN(I)

  AF(2)=A1

  BF(2)=B1

C

C     Forward recurrence: AF(L+1),BF(L+1) depend on AF(L),BF(L), so this
C     loop is sequential in L.
C     NOTE(review): the last pass reads BPR(NJ1+1) and APR(NJ1+1), which
C     STEP1 above does not write -- confirm the intended indices.
  DO L=2,NJ1

    Z=1./(CPR(L)-AF(L)*APR(L))

    AF(L+1)=BPR(L+1)*Z

    BF(L+1)=(APR(L+1)*BF(L)+FPR(L))*Z

  END DO

C

C     Value at the end of the line, J=NJ.
C     NOTE(review): BF is taken at NJ but AF at NJ-2 -- confirm that the
C     differing index is intended.
  YPR(NJ)=(B2+A2*BF(NJ))/(1.-A2*AF(NJ-2))

C

C     Backward recurrence: L runs from NJ-1 down to 1 and each YPR(L)
C     uses YPR(L+1), so this loop is sequential in L as well.
C     NOTE(review): at L=1 this reads AF(0) and BF(0) -- confirm the
C     array lower bounds or the intended index.
  DO LL=2,NJ

    L=NJ-LL+1

    YPR(L)=AF(L-1)*YPR(L+1)+BF(L-1)

  END DO

C===> PROGONKA: END

C========================================>

C===> STEP2

C     Store line I: weighted mix of the new YPR and the previous WUU.
  DO J=1,NIJ

    UUU(I,J)=YPR(J)*OMEGA+(1.-OMEGA)*WUU(I,J)

  END DO

3 CONTINUE

[/codebox]

//4 PART//////////////////////////////////////

[codebox] DO J=1,NIJ

C     Part 4: set row 1 of UUU from row 2; each J is independent.
C     NOTE(review): AL1J and BE1J are not in the posted data list.
    UUU(1,J)=UUU(2,J)*AL1J(J)+BE1J(J)

  END DO

[/codebox]

//5 PART//////////////////////////////////////

[codebox] DO J=1,NIJ

C     Part 5: EPSZ = largest |UUU-WUU|/|UUU| over points with
C     |UUU| > 1.0E-05; WUU is refreshed to the current UUU in the same
C     loop.  EPSZ is a maximum accumulated across all (I,J).
C     NOTE(review): the full listing sets EPSZ=0. right before this
C     loop on every iteration; that reset is not part of this fragment
C     and still has to happen before it runs.
  DO I=1,NIJ

  WZ=ABS(UUU(I,J))

  IF(WZ.GT.1.0E-05) EPSZ=DMAX1(EPSZ,ABS((UUU(I,J)-WUU(I,J))/WZ))

  WUU(I,J)=UUU(I,J)

END DO

END DO

[/codebox]

The kernel launches will be:

[codebox] Kernel_blsor_1<<<blocks_module_blsor_2, threads_module_blsor_2>>>(NJ,NI,
                                                                    WUU_CU,UUU_CU);

// Host driver: Kernel_blsor_1 is launched once, then kernels 2-5 are
// launched once per iteration until the convergence measure EPSZ is no
// longer greater than the tolerance EPSVN.
//
// NOTE(review): EPSZ is handed to Kernel_blsor_5 as a plain argument, so a
// value computed inside the kernel cannot reach this host variable. It
// presumably needs a device buffer that is reset each iteration and copied
// back (device to host) before the loop test -- confirm against the
// signature of Kernel_blsor_5.
// NOTE(review): the Fortran listing sets ITERZ=0 before the loop; that
// initialisation is not shown here.
do
{
	ITERZ=ITERZ+1;

	Kernel_blsor_2<<<blocks_module_blsor_4, threads_module_blsor_4>>>(NJ,NI,NIJ, UUU_CU,GANJ_CU,DNJ_CU);

	Kernel_blsor_3<<<blocks_module_blsor_2, threads_module_blsor_2>>>(NJ,NI,NIJ,
		ALTI1_CU,BETI1_CU,GATIN_CU,DTIN_CU,
		APR_CU,BPR_CU,CPR_CU,FPR_CU,AT_CU,BT_CU,C_CU,
		F_CU,A_CU,UUU_CU,B_CU,AF_CU,BF_CU,YPR_CU,WUU_CU);

	Kernel_blsor_4<<<blocks_module_blsor_4, threads_module_blsor_4>>>(NJ,NI,NIJ, WUU_CU,UUU_CU,AL1J_CU,BE1J_CU);

	Kernel_blsor_5<<<blocks_module_blsor_2, threads_module_blsor_2>>>(NJ,NI,NIJ,
		EPSZ,WUU_CU,UUU_CU);
}
// Same test as the Fortran `IF(EPSZ.GT.EPSVN) GOTO 2`: keep iterating while
// the change is still above the tolerance (the posted `<=` was inverted).
while(EPSZ>EPSVN);

[/codebox]

P.S. In part 5 there is a problem with transferring EPSZ from device to host.

What is the best way to do this?