I have a problem with Fortran → CUDA C interop. My kernels execute fine, but whenever Fortran executes STOP I get a segfault. I tried calling cudaDeviceReset manually, but any C function that contains this call segfaults immediately on entry; it never even reaches a print statement. Neither cuda-memcheck nor debugging with DDT reproduces the segfault, so I'm dealing with a Heisenbug. Can you reproduce this?
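To illustrate "segfault upon entry": even a probe reduced to the following shape (hypothetical name reset_device_c, stripped down from my real code) never reaches its fprintf once cudaDeviceReset is in the body:

// hypothetical probe, reduced from my real code: the fprintf below
// never shows up once cudaDeviceReset() is part of the function body
#include <stdio.h>
#include <cuda_runtime.h>

extern "C" void reset_device_c(void) {
    fprintf(stderr, "entered reset_device_c\n"); // never printed
    fflush(stderr);
    cudaDeviceReset();
}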
Output
test done
Warning: ieee_inexact is signaling
FORTRAN STOP
Segmentation fault
Fortran, compiled/linked with pgf90 15.3
module simple_weather
  use iso_c_binding
  implicit none

  interface
    subroutine diffuse_c(thermal_energy_updated, thermal_energy, nx, ny, nz) bind(C, name="diffuse_c")
      use iso_c_binding, only: c_int, c_double
      integer(c_int), value :: nx, ny, nz
      real(c_double), dimension(0:nx+1,0:ny+1,nz) :: thermal_energy_updated, thermal_energy
    end subroutine diffuse_c
  end interface

contains

  subroutine testrun_cuda_kernels()
    real(8), allocatable, dimension(:,:,:) :: test_data, updated_test_data
    integer(4) :: nx, ny, nz

    nx = 1
    ny = 1
    nz = 3

    ! k (size nz) is the fastest dimension here; the C side indexes with
    ! matching strides, so only the contiguous base address matters
    allocate(test_data(nz,0:nx+1,0:ny+1))
    allocate(updated_test_data(nz,0:nx+1,0:ny+1))
    test_data(:,:,:) = 0.0d0
    updated_test_data(:,:,:) = 0.0d0

    !$acc data copyin(test_data) copy(updated_test_data)
    !$acc host_data use_device(test_data,updated_test_data)
    call diffuse_c(updated_test_data, test_data, nx, ny, nz)
    !$acc end host_data
    !$acc end data
  end subroutine

end module

program main
  use simple_weather
  implicit none
  call testrun_cuda_kernels
  write(6,*) "test done"
  stop
end program main
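Since !$acc host_data use_device is supposed to hand diffuse_c raw device pointers, one check I considered adding to the C file below (hypothetical helper check_is_device_pointer, not part of the failing build) is to confirm the incoming pointers really live on the device:

// hypothetical diagnostic, not in the original build: verify that a
// pointer received via !$acc host_data is actually a device pointer
static void check_is_device_pointer(const void *ptr, const char *name) {
    cudaPointerAttributes attr;
    if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess) {
        fprintf(stderr, "%s: cudaPointerGetAttributes failed\n", name);
        return;
    }
    // CUDA 6.5 exposes 'memoryType' (cudaMemoryTypeDevice == 2);
    // newer toolkits replaced it with 'type'
    fprintf(stderr, "%s: memoryType=%d\n", name, (int)attr.memoryType);
}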
CUDA C Code, compiled with nvcc 6.5.12
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <assert.h>

// k is the fastest-varying index, matching the contiguous layout of the
// Fortran arrays allocated as (nz, 0:nx+1, 0:ny+1)
#define ACCESS_3D(pointer, i, j, k) pointer[(k) + (i) * nz + (j) * nz * (nx + 2)]

typedef const double * const __restrict__ IN;
typedef double * const __restrict__ OUT;
typedef const int SIZE;

extern "C" {
    void diffuse_c(OUT thermal_energy_updated, IN thermal_energy, SIZE nx, SIZE ny, SIZE nz);
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
__global__ void diffuse_kernel(
    OUT thermal_energy_updated, IN thermal_energy, SIZE nx, SIZE ny, SIZE nz,
    const double scale_0, const double scale_rest
) {
    int i = blockIdx.x*blockDim.x + threadIdx.x + 1;
    int j = blockIdx.y*blockDim.y + threadIdx.y + 1;
    if (i > nx || j > ny) {
        return;
    }
    for (int k = 1; k < nz-1; k++) {
        double updated = scale_0 * ACCESS_3D(thermal_energy,i,j,k) + scale_rest * (
            ACCESS_3D(thermal_energy,i-1,j,k) + ACCESS_3D(thermal_energy,i+1,j,k) +
            ACCESS_3D(thermal_energy,i,j-1,k) + ACCESS_3D(thermal_energy,i,j+1,k) +
            ACCESS_3D(thermal_energy,i,j,k-1) + ACCESS_3D(thermal_energy,i,j,k+1)
        );
        ACCESS_3D(thermal_energy_updated,i,j,k) = updated;
    }
}
void diffuse_c(OUT thermal_energy_updated, IN thermal_energy, SIZE nx, SIZE ny, SIZE nz) {
    // setup
    int block_size_x = 32, block_size_y = 16;
    dim3 threads(block_size_x, block_size_y, 1);
    double diffusion_velocity = 0.13;
    // kernel launch for the inner region; round the grid up so every
    // interior point is covered even when nx, ny are not block multiples
    dim3 grid_ij_inner((nx + block_size_x - 1) / block_size_x,
                       (ny + block_size_y - 1) / block_size_y, 1);
    diffuse_kernel<<<grid_ij_inner, threads>>>(
        thermal_energy_updated, thermal_energy, nx, ny, nz,
        1.0 - 6.0 * diffusion_velocity, diffusion_velocity
    );
    gpuErrchk(cudaDeviceSynchronize());
    //cudaDeviceReset(); // THIS LEADS TO A SEGFAULT EVEN *BEFORE* ANY OF THE ABOVE CODE RUNS.
}
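For completeness: the launch itself is only checked indirectly through the synchronize. A diagnostic variant I would try (not the build that produced the output above) pulls the launch status explicitly:

// diagnostic variant of the launch sequence (not in the original build):
// cudaGetLastError() reports launch-configuration failures immediately,
// cudaDeviceSynchronize() reports faults that occur during execution
diffuse_kernel<<<grid_ij_inner, threads>>>(
    thermal_energy_updated, thermal_energy, nx, ny, nz,
    1.0 - 6.0 * diffusion_velocity, diffusion_velocity
);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());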