Hello,
I have some code that produces either an internal compiler error (ICE) or errors in the generated .gpu file (which I assume is something like object code for the GPU). The problem has to do with passing a character variable as an argument to another device routine. If I instead pass the character constant directly (see the commented-out call to sgemv in the code below), I get an ICE: "unexpected address of constant". Otherwise, I get errors in the .gpu file like this:
/tmp/pgcudafor25EgI8rjAavK.gpu(43): error: expected an expression
/tmp/pgcudafor25EgI8rjAavK.gpu(44): error: expected an expression
/tmp/pgcudafor25EgI8rjAavK.gpu(45): error: expected an expression
/tmp/pgcudafor25EgI8rjAavK.gpu(46): error: too many arguments in function call
…
My suspicion is it is an error in the compiler. I’m using version 10.6.
Here’s the offending code:
MODULE imsim_module
use cudafor
contains
!> Host-side launcher for `kernel`.
!!
!! Copies the two pixel widths into device-resident scalars, builds the
!! launch configuration (16 x 16 threads per block) and launches `kernel`.
!!
!! pixxwidth, pixywidth : host values forwarded to the kernel.
subroutine kernel_setup(pixxwidth, pixywidth)
  real :: pixxwidth, pixywidth
  real, device :: dev_pix_x, dev_pix_y
  type(dim3) :: grid_shape, block_shape

  ! Threads per block: a 16 x 16 x 1 tile.
  block_shape = dim3(16, 16, 1)

  ! NOTE(review): detxsize and detysize are not declared in this routine or
  ! in the visible part of the module, so they are implicitly typed and never
  ! assigned here -- confirm where they are meant to come from.
  grid_shape = dim3(detxsize/16, detysize/16, 1)

  ! Assigning a host value to a device variable stages it on the device.
  dev_pix_x = pixxwidth
  dev_pix_y = pixywidth

  call kernel<<<grid_shape, block_shape>>>(dev_pix_x, dev_pix_y)
end subroutine kernel_setup
!> Device-callable equality test for two single characters.
!!
!! Returns .true. only when CA and CB are identical. The comparison is
!! exact: no case folding is applied.
attributes(device) logical function mylsame(ca, cb)
  character :: ca, cb

  mylsame = (ca == cb)
end function mylsame
!> Device-side single-precision matrix-vector product (reference BLAS SGEMV
!! algorithm):
!!
!!     y := alpha*A*x    + beta*y     when TRANS is 'N'
!!     y := alpha*A**T*x + beta*y     when TRANS is 'T' or 'C'
!!
!! Arguments
!!   trans : operation selector; must be exactly 'N', 'T' or 'C'
!!           (MYLSAME compares exactly, so lower case is rejected).
!!   m, n  : number of rows / columns of A.
!!   alpha : scalar multiplier of the matrix-vector product.
!!   a     : matrix, leading dimension LDA; columns 1..N are read.
!!   lda   : leading dimension of A; must be at least max(1, M).
!!   x     : input vector, stride INCX (must be non-zero).
!!   beta  : scalar multiplier of the incoming Y.
!!   y     : vector updated in place, stride INCY (must be non-zero).
!!
!! On an invalid argument the routine returns without touching Y; the
!! local INFO code is not reported to the caller.
attributes(device) subroutine sgemv(trans, m, n, alpha, a, lda, x, incx, &
                                    beta, y, incy)
  implicit none
  character(len=1), intent(in) :: trans
  integer, intent(in)          :: m, n, lda, incx, incy
  real, intent(in)             :: alpha, beta
  real, intent(in)             :: a(lda, *), x(*)
  real, intent(inout)          :: y(*)

  real, parameter :: one = 1.0e+0, zero = 0.0e+0
  real    :: temp
  integer :: i, info, ix, iy, j, jx, jy, kx, ky, lenx, leny
  intrinsic max
  ! The selectors are held in variables rather than passed as literals:
  ! passing a character constant straight to a device routine was reported
  ! to trigger an internal compiler error with the compiler in use.
  character(len=1) :: nn, tt, cc

  ! MYLSAME is deliberately NOT re-declared here. It is a module procedure
  ! reached by host association; a local type declaration for the name would
  ! hide it and turn every reference into a call to an external function.

  nn = 'N'
  tt = 'T'
  cc = 'C'

  ! ---- Argument validation (INFO holds the index of the bad argument) ----
  info = 0
  if (.not. mylsame(trans, nn) .and. .not. mylsame(trans, tt) .and. &
      .not. mylsame(trans, cc)) then
    info = 1
  else if (m < 0) then
    info = 2
  else if (n < 0) then
    info = 3
  else if (lda < max(1, m)) then
    info = 6
  else if (incx == 0) then
    info = 8
  else if (incy == 0) then
    info = 11
  end if
  if (info /= 0) then
    return
  end if

  ! ---- Quick return: nothing to compute ----
  if ((m == 0) .or. (n == 0) .or. &
      ((alpha == zero) .and. (beta == one))) return

  ! Logical lengths of X and Y depend on whether A is used transposed.
  if (mylsame(trans, nn)) then
    lenx = n
    leny = m
  else
    lenx = m
    leny = n
  end if

  ! Start indices: with a negative stride the vector is walked backwards,
  ! so the first logical element sits at the far end of the storage.
  if (incx > 0) then
    kx = 1
  else
    kx = 1 - (lenx - 1)*incx
  end if
  if (incy > 0) then
    ky = 1
  else
    ky = 1 - (leny - 1)*incy
  end if

  ! ---- First form y := beta*y ----
  if (beta /= one) then
    if (incy == 1) then
      if (beta == zero) then
        do i = 1, leny
          y(i) = zero
        end do
      else
        do i = 1, leny
          y(i) = beta*y(i)
        end do
      end if
    else
      iy = ky
      if (beta == zero) then
        do i = 1, leny
          y(iy) = zero
          iy = iy + incy
        end do
      else
        do i = 1, leny
          y(iy) = beta*y(iy)
          iy = iy + incy
        end do
      end if
    end if
  end if

  if (alpha == zero) return

  if (mylsame(trans, nn)) then
    ! ---- y := alpha*A*x + y : accumulate one column of A at a time ----
    jx = kx
    if (incy == 1) then
      do j = 1, n
        if (x(jx) /= zero) then
          temp = alpha*x(jx)
          do i = 1, m
            y(i) = y(i) + temp*a(i, j)
          end do
        end if
        jx = jx + incx
      end do
    else
      do j = 1, n
        if (x(jx) /= zero) then
          temp = alpha*x(jx)
          iy = ky
          do i = 1, m
            y(iy) = y(iy) + temp*a(i, j)
            iy = iy + incy
          end do
        end if
        jx = jx + incx
      end do
    end if
  else
    ! ---- y := alpha*A**T*x + y : one dot product per column of A ----
    jy = ky
    if (incx == 1) then
      do j = 1, n
        temp = zero
        do i = 1, m
          temp = temp + a(i, j)*x(i)
        end do
        y(jy) = y(jy) + alpha*temp
        jy = jy + incy
      end do
    else
      do j = 1, n
        temp = zero
        ix = kx
        do i = 1, m
          temp = temp + a(i, j)*x(ix)
          ix = ix + incx
        end do
        y(jy) = y(jy) + alpha*temp
        jy = jy + incy
      end do
    end if
  end if
end subroutine sgemv
!> Minimal reproducer kernel: calls the device routine `sgemv` with a
!! character selector held in a variable.
!!
!! pixxwidth, pixywidth : received from the launcher; not used in the body.
attributes(global) subroutine kernel(pixxwidth, pixywidth)
  implicit none
  real :: pixxwidth, pixywidth
  character, device :: TT
  real, device :: A(4,2), X(2), Y(2)
  real, device :: alpha, beta

  alpha = 1.0
  beta = 1.0
  TT = 'T'

  ! NOTE(review): A, X and Y are never assigned before the call, so sgemv
  ! operates on undefined values; acceptable only for a compile reproducer.

  ! Passing the literal directly was reported to cause an internal compiler
  ! error ("unexpected address of constant"), hence the TT variable:
  ! call sgemv('T', 4, 2, alpha, A, 4, X, 1, beta, Y, 1 )
  call sgemv(TT, 4, 2, alpha, A, 4, X, 1, beta, Y, 1)
end subroutine kernel
END MODULE imsim_module
On another note, I’ve been porting some F77 code to CUDA Fortran. The code uses BLAS/LAPACK and other netlib routines that rely on machine-constant routines (e.g., r1mach.f, d1mach.f, etc. from netlib). Is there existing CUDA Fortran code that provides these machine constants for the GPU architecture, so that I can tune my routines accordingly?
Thanks.