copyout Memcpy failed, unspecified launch failure

hi everyone!
I am using CUDA Fortran (PGI 11.8) on a Tesla M2050. I have encountered a very strange error:

0: copyout Memcpy (host=0x6c70d0, dev=0xa140000, size=16) FAILED: 4(unspecified launch failure)

It seems that the data transfer from device to host went wrong. My code is very simple (see below). If I change b1 from an array to a scalar, there is no error.

! Kernel from the original report: adds b1(1), which is 2, to A(1,1).
! It is written as an external procedure, i.e. outside any module.
attributes(global) subroutine addkernel(A)
integer,intent(inout)::A(:,:)
! Local two-element array; only its first element is read below.
integer::b1(1:2)
b1=2
A(1,1)=A(1,1)+b1(1)
end subroutine

! Host driver from the original report: copies a 2x2 array of 2s to the
! device, launches addkernel once, and copies the result back.
! NOTE: addkernel is an external procedure and no explicit interface for it
! is visible in this program; the reply below identifies that as the problem.
program main
use cudafor
implicit none

integer,device,allocatable::A_d(:,:)
integer,allocatable::A(:,:)
type(dim3)::dimGrid,dimBlock

! Launch configuration: a 1x1x1 grid of 1x1x1 blocks.
dimGrid=dim3(1,1,1)
dimBlock=dim3(1,1,1)
allocate(A(2,2))
A=2
allocate(A_d(2,2))
A_d=A          ! host -> device
! Launch configuration restored: the forum software had stripped it to "<<>>".
call addkernel<<<dimGrid,dimBlock>>>(A_d)
A=A_d          ! device -> host
end program main

Can anybody tell me what is wrong?
thank you in advance

It seems that if I change “integer::b1(1:2)” to “integer,shared::b1(1:2)”, it works. Can we not use registers to store an array in CUDA Fortran?

Hi Steve,

The problem here is that you don’t have an explicit interface to addkernel. An explicit interface (provided either by placing the routine in a module or by writing an interface block) is required when calling device routines, and failing to provide one can lead to undefined behaviour.

There are two ways to fix your code. Either put the global routine into a module:

% cat test.cuf 
! Module wrapper for the kernel. Because addkernel is a module procedure,
! any program unit that does "use foo" sees its explicit interface.
module foo
  implicit none

contains

  ! Kernel: adds b1(1), which is 2, to A(1,1).
  !   A - rank-2 integer array, updated in place
  attributes(global) subroutine addkernel(A)
    integer, intent(inout) :: A(:,:)
    integer :: b1(1:2)   ! local array kept exactly as in the original report

    b1 = 2
    A(1,1) = A(1,1) + b1(1)
  end subroutine addkernel

end module foo

! Host driver: copies a 2x2 array of 2s to the device, launches addkernel
! (from module foo) once, copies the result back and prints it.
program main
use cudafor
use foo
implicit none

integer,device,allocatable::A_d(:,:)
integer,allocatable::A(:,:)
type(dim3)::dimGrid,dimBlock

! Launch configuration: a 1x1x1 grid of 1x1x1 blocks.
dimGrid=dim3(1,1,1)
dimBlock=dim3(1,1,1)
allocate(A(2,2))
A=2
allocate(A_d(2,2))
A_d=A          ! host -> device
! The launch takes <<<grid, block>>>; dimBlock had been stripped from this
! line by the forum software and is restored here.
call addkernel<<<dimGrid,dimBlock>>>(A_d)
A=A_d          ! device -> host
print *, A
end program main
% pgf90 test.cuf -V12.3; a.out
            4            2            2            2

Or add an explicit interface:

% cat test.cuf 

! Kernel kept as an external procedure (outside any module), so the caller
! has to supply its interface through an interface block.
! Adds b1(1), which is 2, to A(1,1).
attributes(global) subroutine addkernel(A)
  integer, intent(inout) :: A(:,:)
  integer :: b1(1:2)

  b1(:) = 2
  A(1,1) = b1(1) + A(1,1)
end subroutine addkernel


! Host driver: copies a 2x2 array of 2s to the device, launches the external
! kernel addkernel once, copies the result back and prints it.
program main
use cudafor
implicit none

integer,device,allocatable::A_d(:,:)
integer,allocatable::A(:,:)
type(dim3)::dimGrid,dimBlock

! Explicit interface for the external kernel, so the compiler knows it is a
! global routine taking an assumed-shape array.
interface
attributes(global) subroutine addkernel(A)
integer,intent(inout)::A(:,:)
end subroutine addkernel
end interface

! Launch configuration: a 1x1x1 grid of 1x1x1 blocks.
dimGrid=dim3(1,1,1)
dimBlock=dim3(1,1,1)
allocate(A(2,2))
A=2
allocate(A_d(2,2))
A_d=A          ! host -> device
! dimBlock restored: the forum software had stripped it from the launch.
call addkernel<<<dimGrid,dimBlock>>>(A_d)
A=A_d          ! device -> host
print *, A
end program main
% pgf90 test.cuf -V12.3 ; a.out
            4            2            2            2

Hope this helps,
Mat

Hi, Mat. Actually, here is my whole code, but the problem is still the same. Since my code is very simple, I have checked it many times and just cannot find what is wrong.
My CUDA driver version and runtime version are both 3.0, and PGI is 11.8.

! Reproducer module: holds the device array, a scalar declared with the
! constant attribute, a device helper routine, and the kernel under test.
module mod_cuda_test
use cudafor
implicit none
! Rank-4 allocatable array declared with the device attribute.
integer,device,allocatable::A_d(:,:,:,:)
! Scalar declared with the constant attribute; read by addkernel2.
integer,constant::b=3
contains
! Device routine: stores b1+b in the local c. The result is not returned,
! and nothing in this module calls the routine.
attributes(device) subroutine addkernel2(b1)
implicit none
integer,intent(in),value::b1
integer::c
c=b1+b
end subroutine addkernel2

! Kernel: adds b1(1), which is 2, to A(1,1,1,1).
! b2 is assigned but never read.
attributes(global) subroutine addkernel(A)
implicit none
integer,device,intent(inout)::A(:,:,:,:)
integer::b1(1:2),b2(1:2)
b1=2
b2=3
A(1,1,1,1)=A(1,1,1,1)+b1(1)
end subroutine addkernel

end module mod_cuda_test

! Host driver for the reproducer: fills a 2x2x2x2 host array with 2s, copies
! it to the module's device array A_d, launches addkernel once, copies the
! result back and prints it, with progress messages around each step.
program test_kernel
use cudafor
use mod_cuda_test
implicit none
integer,pointer::A(:,:,:,:)
type(dim3)::dimGrid,dimBlock

! Launch configuration: a 1x1x1 grid of 1x1x1 blocks.
dimGrid=dim3(1,1,1)
dimBlock=dim3(1,1,1)

allocate(A(2,2,2,2))
A=2
allocate(A_d(2,2,2,2))

A_d=A          ! host -> device
! The write statements and the launch configuration below are restored: the
! forum software had stripped the asterisks and the chevron contents.
write(*,*) "before call kernel "
call addkernel<<<dimGrid,dimBlock>>>(A_d)
write(*,*) "after call kernel "
A=A_d          ! device -> host
write(*,*) "after transfer data from device to host"
write(*,*) A

end program test_kernel

Hi Steve,

Your code works fine for me, so I suspect that the old CUDA driver may be at fault. Can you try updating it?

  • Mat
% pgf90 test.cuf -V11.8 -fast 
% a.out
 before call kernel 
 after call kernel 
 after transfer data from device to host
            4            2            2            2            2            2 
            2            2            2            2            2            2 
            2            2            2            2
% pgaccelinfo
CUDA Driver Version:           4010
NVRM version: NVIDIA UNIX x86_64 Kernel Module  285.05.33  Thu Jan 19 14:07:02 PST 2012
... cut