Hello ! I’m novice in CUDA development. I have written a simple code to do some array computations on CUDA. All computations are just addition and multiplication of one dimensional array elements between each other and all this operations are independent. Main code originally in Fortran, so I took piece from FORTCUDA (FortCUDA download | SourceForge.net) project interface:
module cuda_runtime
use, intrinsic :: ISO_C_BINDING
use cuda_unknowns
enum, bind(C) !:: cudaError
enumerator :: cudaSuccess=0
enumerator :: cudaErrorMissingConfiguration=1
enumerator :: cudaErrorMemoryAllocation=2
enumerator :: cudaErrorInitializationError=3
enumerator :: cudaErrorLaunchFailure=4
enumerator :: cudaErrorPriorLaunchFailure=5
enumerator :: cudaErrorLaunchTimeout=6
enumerator :: cudaErrorLaunchOutOfResources=7
enumerator :: cudaErrorInvalidDeviceFunction=8
enumerator :: cudaErrorInvalidConfiguration=9
enumerator :: cudaErrorInvalidDevice=10
enumerator :: cudaErrorInvalidValue=11
enumerator :: cudaErrorInvalidPitchValue=12
enumerator :: cudaErrorInvalidSymbol=13
enumerator :: cudaErrorMapBufferObjectFailed=14
enumerator :: cudaErrorUnmapBufferObjectFailed=15
enumerator :: cudaErrorInvalidHostPointer=16
enumerator :: cudaErrorInvalidDevicePointer=17
enumerator :: cudaErrorInvalidTexture=18
enumerator :: cudaErrorInvalidTextureBinding=19
enumerator :: cudaErrorInvalidChannelDescriptor=20
enumerator :: cudaErrorInvalidMemcpyDirection=21
enumerator :: cudaErrorAddressOfConstant=22
enumerator :: cudaErrorTextureFetchFailed=23
enumerator :: cudaErrorTextureNotBound=24
enumerator :: cudaErrorSynchronizationError=25
enumerator :: cudaErrorInvalidFilterSetting=26
enumerator :: cudaErrorInvalidNormSetting=27
enumerator :: cudaErrorMixedDeviceExecution=28
enumerator :: cudaErrorCudartUnloading=29
enumerator :: cudaErrorUnknown=30
enumerator :: cudaErrorNotYetImplemented=31
enumerator :: cudaErrorMemoryValueTooLarge=32
enumerator :: cudaErrorInvalidResourceHandle=33
enumerator :: cudaErrorNotReady=34
enumerator :: cudaErrorInsufficientDriver=35
enumerator :: cudaErrorSetOnActiveProcess=36
enumerator :: cudaErrorInvalidSurface=37
enumerator :: cudaErrorNoDevice=38
enumerator :: cudaErrorECCUncorrectable=39
enumerator :: cudaErrorSharedObjectSymbolNotFound=40
enumerator :: cudaErrorSharedObjectInitFailed=41
enumerator :: cudaErrorUnsupportedLimit=42
enumerator :: cudaErrorDuplicateVariableName=43
enumerator :: cudaErrorDuplicateTextureName=44
enumerator :: cudaErrorDuplicateSurfaceName=45
enumerator :: cudaErrorDevicesUnavailable=46
enumerator :: cudaErrorStartupFailure=127
enumerator :: cudaErrorApiFailureBase=10000
end enum ! cudaError
enum, bind(C) !:: cudaMemcpyKind
enumerator :: cudaMemcpyHostToHost=0
enumerator :: cudaMemcpyHostToDevice=1
enumerator :: cudaMemcpyDeviceToHost=2
enumerator :: cudaMemcpyDeviceToDevice=3
end enum ! cudaMemcpyKind
interface ! [['cudaError_t', None], 'cudaMemcpy', [['void', '*', 'dst'], ['const', 'void', '*', 'src'], ['size_t', None, 'count'], ['enum', 'cudaMemcpyKind', None, 'kind']]]
function cudaMemcpy(dst,src,count,kind_arg) result( res ) bind(C, name="cudaMemcpy")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
import cudaMemcpyHostToHost
implicit none
type(c_ptr), value :: dst
type(c_ptr), value :: src
integer(c_int), value :: count
integer (KIND(cudaMemcpyHostToHost)), value :: kind_arg
integer (KIND(cudaSuccess)) :: res
end function cudaMemcpy
end interface
interface ! [['cudaError_t', None], 'cudaMalloc', [['void', '**', 'devPtr'], ['size_t', None, 'size']]]
function cudaMalloc(devPtr,size) result( res ) bind(C, name="cudaMalloc")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
implicit none
type(c_ptr) :: devPtr
integer(c_int), value :: size
integer (KIND(cudaSuccess)) :: res
end function cudaMalloc
end interface
interface ! [['cudaError_t', None], 'cudaFree', [['void', '*', 'devPtr']]]
function cudaFree(devPtr) result( res ) bind(C, name="cudaFree")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
implicit none
type(c_ptr), value :: devPtr
integer (KIND(cudaSuccess)) :: res
end function cudaFree
end interface
interface ! [['cudaError_t', None], 'cudaSetDevice', [['int', None, 'device']]]
function cudaSetDevice(device) result( res ) bind(C, name="cudaSetDevice")
use, intrinsic :: ISO_C_BINDING
import cudaSuccess
implicit none
integer(c_int), value :: device
integer (KIND(cudaSuccess)) :: res
end function cudaSetDevice
end interface
end module
CUDA kernel is called from Fortran subroutine:
SUBROUTINE CUDA_SUB(cut_X, cut_size, var_num, constant1)
use, intrinsic :: ISO_C_BINDING
use cuda_runtime
integer, parameter :: fp_kind = kind(0.0d0) ! Double precision
integer cut_size
real(fp_kind), dimension(cut_size), target :: cut_X
integer(c_int) :: var_num
real(fp_kind) :: constant1
type (c_ptr) :: cptr_X
real(fp_kind), allocatable, target :: d_X(:)
real(fp_kind) ,pointer, dimension (:) :: fptr_X
integer(c_int) :: buf_size, cells_num
integer :: err
cells_num = cut_size/KL + 1
allocate(d_X(cut_size))
buf_size = (cut_size+KL)*fp_kind
err = cudaSetDevice(0)
err = cudaMalloc(cptr_X, buf_size)
err = cudaMemcpy(cptr_X, c_loc(cut_X), buf_size, cudaMemCpyHostToDevice)
call cudacalc(fptr_X, cells_num, var_num,constant1)
err = cudaMemcpy(c_loc(cut_X), cptr_X, buf_size, cudaMemCpyDeviceToHost)
err = cudaFree(cptr_X)
END
And there is simple CUDA kernel:
#include "stdio.h"
__global__ void Loop(double *X, int CellsNum, int VarNum,const double constant1)
{
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int i = (index+1)*VarNum ;
double exp1,exp2,exp3,exp4 ;
if(idx<CellsNum-2) {
exp1=double(0.5)*(X[i+6+VarNum]+X[i+6])+X[i+10] ;
exp2=double(0.5)*(X[i+8+VarNum]+X[i+8]) ;
if(i==0) {
printf("%e %e",exp1,exp2) ;
}
exp3=X[i+11]-constant1*(exp1*exp2)/X[i+5] ;
exp4=constant1*(X[i+9]*exp1-X[i+9-VarNum]*exp2)/X[i+5] ;
X[i+12]=exp3+exp4;
}
}
extern "C" void cudacalc_(double *a, int* N1, int* N2, double* N3)
{
int Cells_Num = *N1;
int Var_Num = *N2;
double constant1 = *N3;
Loop<<<1,Cells_Num>>>(a,Cells_Num,Var_Num,constant1);
}
Performance of this computations is quite low, even in comparison with singe CPU, but i cannot understand why. Another strange thing here is that than I remove printf from Loop subroutine it stops working properly and produce NaNs.
Any help will be appreciated, thank you in advance!