Hi,

I am porting an aeroacoustics code that is using MPI/OpenMP/Accelerator to use MPI/OpenMP/CUDA Fortran. To simplify the process, I am using the CUF kernel loop directive in the following subroutine:

```
subroutine cuda_surfaces (surface_accel,surf_x_ip,surf_x_im,surf_y_jp,surf_y_jm)
use accel
use cudafor
double precision, dimension (0:i_max_accel+1,0:j_max_accel+1,4), intent (in), device :: surface_accel
double precision, dimension (0:i_max_accel+1,0:j_max_accel+1), intent (out), device :: surf_x_ip,surf_x_im,surf_y_jp,surf_y_jm
integer :: i,j
!$cuf kernel do(2) <<< (*,*), (n_cores_smp,nu_smps_gpu) >>>
do j=0,j_max_accel+1
do i=0,i_max_accel+1 !line 2161:cudaLaunch returned status 4
surf_x_ip(i,j)=surface_accel(i,j,1)
surf_x_im(i,j)=surface_accel(i,j,2)
surf_y_jp(i,j)=surface_accel(i,j,3)
surf_y_jm(i,j)=surface_accel(i,j,4)
end do
end do
end subroutine cuda_surfaces
```

where the **accel** module is given by:

```
module accel
integer, parameter :: n_cores_smp=32
integer, parameter :: nu_smps_gpu=14
integer, parameter :: i_max_accel=2336
integer, parameter :: j_max_accel=1624
end module accel
```

Subroutine **cuda_surfaces** is called by subroutine **euler_solver**, where the related parts are:

```
subroutine euler_solver (....)
....
use accel
use cudafor
....
double precision, dimension (0:i_max_accel+1,0:j_max_accel+1,4) :: surface_host
double precision, dimension (0:i_max_accel+1,0:j_max_accel+1,4), device :: surface_accel
double precision, dimension (0:i_max_accel+1,0:j_max_accel+1), device :: surf_x_ip,surf_x_im,surf_y_jp,surf_y_jm,vol
....
call mpi_barrier (mpi_comm_world,ierr)
....
!setting the number of threads for the following openmp parallel region equal to the number of gpus
call omp_set_num_threads(n_threads_gpus)
....
!$omp parallel, private (surface_accel,surf_x_ip,surf_x_im,surf_y_jp,surf_y_jm)
....
devnum=omp_get_thread_num()
istat=cudaSetDevice(devnum)
....
!$omp do
do l_block=1,l_max_block !block size is i_max_accel*j_max_accel
.....
surface_accel=surface_host
....
call cuda_surfaces (surface_accel,surf_x_ip,surf_x_im,surf_y_jp,surf_y_jm)
....
end do
!$omp end do
!$omp end parallel
....
call mpi_barrier (mpi_comm_world,ierr)
return
end subroutine euler_solver
```

When I try to run the code, I get the following error message:

**line 2161: cudaLaunch returned status 4: unspecified launch failure**

where line 2161 is shown commented in subroutine **cuda_surfaces** and corresponds to the innermost loop. The compilation message related to this subroutine is:

```
cuda_surfaces:
2161, CUDA kernel generated
2161, !$cuf kernel do <<< (*,*), (32,14) >>>
```

Any help will be appreciated.

Regards,

Roberto