Hello,
I am trying to write a code which calling CUDA C code:
I found an olde example on the internet. All the CUDA operations (allocation, cp from H to D, kernel, cp from D to H) are one cu file and the fortran code is just calling one function:
__global__ void vect_add(double *a, int N)
{
int idx = threadIdx.x;
if (idx<N) a[idx] = a[idx] * a[idx];
}
// function called from main fortran program
extern "C" void kernel_wrapper_(double *a, int *Np)
{
double *a_d; // declare GPU vector copies
int blocks = 1; // uses 1 block of
int N = *Np; // N threads on GPU
// Allocate memory on GPU
cudaMalloc( (void **)&a_d, sizeof(double) * N );
// copy vectors from CPU to GPU
cudaMemcpy( a_d, a, sizeof(double) * N, cudaMemcpyHostToDevice );
// call function on GPU
vect_add<<< blocks, N >>>( a_d, N);
// copy vectors back from GPU to CPU
cudaMemcpy( a, a_d, sizeof(double) * N, cudaMemcpyDeviceToHost );
// free GPU memory
cudaFree(a_d);
printf("Test 00 \n");
return;
}
Th fortran code is very simple:
IMPLICIT NONE
integer*4 :: i
integer*4, parameter :: N=8, tpb=8
real*8, Dimension(N) :: a
DO i=1,N
a(i)=i*1.0
END DO
print *, ' a = ', (a(i), i=1,N)
CALL kernel_wrapper(a, N)
print *, 'a**2=', (a(i), i=1,N)
The cu file is compiled with -c
option and then the final compilation is done with gfortran
. This code works without problems,.
I tried to break the code for each operation a wrapper, one for allocation one for cp from H to D, one for the kernel , and finally one for the cp from D to H:
extern "C" void cuda_malloc_(double *a_d, int *Np)
{
int N = *Np; // N threads on GPU
// Allocate memory on GPU
cudaMalloc( (void **)&a_d, sizeof(double) * N );
printf("Test 01 \n");
return;
}
extern "C" void cuda_cpy_htod_(double *a, double *a_d, int *Np)
{
int N = *Np; // N threads on GPU
cudaMemcpy( a_d, a, sizeof(double) * N, cudaMemcpyHostToDevice );
printf("Test 02 \n");
return;
}
extern "C" void cuda_cpy_dtoh_(double *a_d, double *a ,int *Np)
{
int N = *Np; // N threads on GPU
cudaMemcpy( a, a_d, sizeof(double) * N, cudaMemcpyDeviceToHost );
printf("Test 04 \n");
return;
}
extern "C" void cuda_free_(double *a_d)
{
cudaFree(a_d);
printf("Test 05 \n");
return;
}
__global__ void vect_square(double *a, int N)
{
int idx = threadIdx.x+blockIdx.x*gridDim.x;
printf("%lf n", a[idx] * a[idx]);
if (idx<N) a[idx] = a[idx] * a[idx];
}
extern "C" void dev_square_(double *a_d, int *Np, int *tpb)
{
int N = *Np; // N threads on GPU
int threads= *tpb;
int blocks = (N+threads-1)/threads;
// call function on GPU
vect_square<<< blocks, threads>>>(a_d, N);
printf("Test 04 \n");
return;
}
In the fortran code I do this:
use iso_c_binding
IMPLICIT NONE
integer*4 :: i
integer*4, parameter :: N=8, tpb=8
real*8, Dimension(N) :: a
type(c_ptr) :: a_d
DO i=1,N
a(i)=i*1.0
END DO
call cuda_malloc(a_d, N)
call cuda_cpy_htod(a, a_d, N)
call dev_square(a_d, N, tpb)
DO i=1,N
a(i)=0.0
END DO
call cuda_cpy_dtoh(a_d, a, N)
call cuda_free(a_d)
print *, 'a**2=', (a(i), i=1,N)
In this case the wrappers appear to execute, but the CUDA part does does not seem to work.
What I am doing wrong here?