CUDA with Fortran Interface performance.

Hello ! I’m novice in CUDA development. I have written a simple code to do some array computations on CUDA. All computations are just addition and multiplication of one dimensional array elements between each other and all this operations are independent. Main code originally in Fortran, so I took piece from FORTCUDA (http://sourceforge.net/projects/fortcuda/) project interface:

module cuda_runtime 

    use, intrinsic :: ISO_C_BINDING
    use cuda_unknowns

    enum, bind(C) !:: cudaError
      enumerator :: cudaSuccess=0
      enumerator :: cudaErrorMissingConfiguration=1
      enumerator :: cudaErrorMemoryAllocation=2
      enumerator :: cudaErrorInitializationError=3
      enumerator :: cudaErrorLaunchFailure=4
      enumerator :: cudaErrorPriorLaunchFailure=5
      enumerator :: cudaErrorLaunchTimeout=6
      enumerator :: cudaErrorLaunchOutOfResources=7
      enumerator :: cudaErrorInvalidDeviceFunction=8
      enumerator :: cudaErrorInvalidConfiguration=9
      enumerator :: cudaErrorInvalidDevice=10
      enumerator :: cudaErrorInvalidValue=11
      enumerator :: cudaErrorInvalidPitchValue=12
      enumerator :: cudaErrorInvalidSymbol=13
      enumerator :: cudaErrorMapBufferObjectFailed=14
      enumerator :: cudaErrorUnmapBufferObjectFailed=15
      enumerator :: cudaErrorInvalidHostPointer=16
      enumerator :: cudaErrorInvalidDevicePointer=17
      enumerator :: cudaErrorInvalidTexture=18
      enumerator :: cudaErrorInvalidTextureBinding=19
      enumerator :: cudaErrorInvalidChannelDescriptor=20
      enumerator :: cudaErrorInvalidMemcpyDirection=21
      enumerator :: cudaErrorAddressOfConstant=22
      enumerator :: cudaErrorTextureFetchFailed=23
      enumerator :: cudaErrorTextureNotBound=24
      enumerator :: cudaErrorSynchronizationError=25
      enumerator :: cudaErrorInvalidFilterSetting=26
      enumerator :: cudaErrorInvalidNormSetting=27
      enumerator :: cudaErrorMixedDeviceExecution=28
      enumerator :: cudaErrorCudartUnloading=29
      enumerator :: cudaErrorUnknown=30
      enumerator :: cudaErrorNotYetImplemented=31
      enumerator :: cudaErrorMemoryValueTooLarge=32
      enumerator :: cudaErrorInvalidResourceHandle=33
      enumerator :: cudaErrorNotReady=34
      enumerator :: cudaErrorInsufficientDriver=35
      enumerator :: cudaErrorSetOnActiveProcess=36
      enumerator :: cudaErrorInvalidSurface=37
      enumerator :: cudaErrorNoDevice=38
      enumerator :: cudaErrorECCUncorrectable=39
      enumerator :: cudaErrorSharedObjectSymbolNotFound=40
      enumerator :: cudaErrorSharedObjectInitFailed=41
      enumerator :: cudaErrorUnsupportedLimit=42
      enumerator :: cudaErrorDuplicateVariableName=43
      enumerator :: cudaErrorDuplicateTextureName=44
      enumerator :: cudaErrorDuplicateSurfaceName=45
      enumerator :: cudaErrorDevicesUnavailable=46
      enumerator :: cudaErrorStartupFailure=127
      enumerator :: cudaErrorApiFailureBase=10000
    end enum ! cudaError

    enum, bind(C) !:: cudaMemcpyKind
      enumerator :: cudaMemcpyHostToHost=0
      enumerator :: cudaMemcpyHostToDevice=1
      enumerator :: cudaMemcpyDeviceToHost=2
      enumerator :: cudaMemcpyDeviceToDevice=3
    end enum ! cudaMemcpyKind   

    interface ! [['cudaError_t', None], 'cudaMemcpy', [['void', '*', 'dst'], ['const', 'void', '*', 'src'], ['size_t', None, 'count'], ['enum', 'cudaMemcpyKind', None, 'kind']]]
      function cudaMemcpy(dst,src,count,kind_arg) result( res ) bind(C, name="cudaMemcpy")
        use, intrinsic :: ISO_C_BINDING
        import cudaSuccess
        import cudaMemcpyHostToHost
        implicit none
        type(c_ptr), value :: dst
        type(c_ptr), value :: src
        integer(c_int), value :: count
        integer (KIND(cudaMemcpyHostToHost)), value :: kind_arg
        integer (KIND(cudaSuccess)) :: res
      end function cudaMemcpy
    end interface

    interface ! [['cudaError_t', None], 'cudaMalloc', [['void', '**', 'devPtr'], ['size_t', None, 'size']]]
      function cudaMalloc(devPtr,size) result( res ) bind(C, name="cudaMalloc")
        use, intrinsic :: ISO_C_BINDING
        import cudaSuccess
        implicit none
        type(c_ptr) :: devPtr
        integer(c_int), value :: size
        integer (KIND(cudaSuccess)) :: res
      end function cudaMalloc
    end interface

    interface ! [['cudaError_t', None], 'cudaFree', [['void', '*', 'devPtr']]]
      function cudaFree(devPtr) result( res ) bind(C, name="cudaFree")
        use, intrinsic :: ISO_C_BINDING
        import cudaSuccess
        implicit none
        type(c_ptr), value :: devPtr
        integer (KIND(cudaSuccess)) :: res
      end function cudaFree
    end interface

    interface ! [['cudaError_t', None], 'cudaSetDevice', [['int', None, 'device']]]
      function cudaSetDevice(device) result( res ) bind(C, name="cudaSetDevice")
        use, intrinsic :: ISO_C_BINDING
        import cudaSuccess
        implicit none
        integer(c_int), value :: device
        integer (KIND(cudaSuccess)) :: res
      end function cudaSetDevice
    end interface

end module

CUDA kernel is called from Fortran subroutine:

SUBROUTINE CUDA_SUB(cut_X, cut_size, var_num, constant1)

    use, intrinsic :: ISO_C_BINDING
    use cuda_runtime

    integer, parameter :: fp_kind = kind(0.0d0) ! Double precision

    integer cut_size 
    real(fp_kind), dimension(cut_size), target :: cut_X
    integer(c_int) :: var_num 
    real(fp_kind) :: constant1

    type (c_ptr) :: cptr_X
    real(fp_kind), allocatable, target :: d_X(:)
    real(fp_kind) ,pointer, dimension (:) :: fptr_X 
    integer(c_int) :: buf_size, cells_num 
    integer :: err 

    cells_num = cut_size/KL + 1

    allocate(d_X(cut_size))

    buf_size = (cut_size+KL)*fp_kind

    err = cudaSetDevice(0)

    err = cudaMalloc(cptr_X, buf_size)
    err = cudaMemcpy(cptr_X, c_loc(cut_X), buf_size, cudaMemCpyHostToDevice)

    call cudacalc(fptr_X, cells_num, var_num,constant1)

    err = cudaMemcpy(c_loc(cut_X), cptr_X, buf_size, cudaMemCpyDeviceToHost)
    err = cudaFree(cptr_X)

END

And there is simple CUDA kernel:

#include "stdio.h"

__global__ void Loop(double *X, int CellsNum, int VarNum,const double constant1)
{

    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    int i = (index+1)*VarNum ;
    double exp1,exp2,exp3,exp4 ; 

    if(idx<CellsNum-2) {

        exp1=double(0.5)*(X[i+6+VarNum]+X[i+6])+X[i+10] ;
        exp2=double(0.5)*(X[i+8+VarNum]+X[i+8]) ;

        if(i==0) {
            printf("%e %e",exp1,exp2) ; 
        }

        exp3=X[i+11]-constant1*(exp1*exp2)/X[i+5] ;

        exp4=constant1*(X[i+9]*exp1-X[i+9-VarNum]*exp2)/X[i+5] ;

        X[i+12]=exp3+exp4;
    }
}

extern "C" void cudacalc_(double *a, int* N1, int* N2, double* N3)
   {
    int Cells_Num = *N1;
    int Var_Num = *N2; 
    double constant1 = *N3;

    Loop<<<1,Cells_Num>>>(a,Cells_Num,Var_Num,constant1);

   }

Performance of this computations is quite low, even in comparison with singe CPU, but i cannot understand why. Another strange thing here is that than I remove printf from Loop subroutine it stops working properly and produce NaNs.

Any help will be appreciated, thank you in advance!