C++ project file management using CUDA — where should the CUDA code go?


I am trying to write some libraries using the CUDA architecture. However, the program should use the GP-GPU only if one is found on the computer.

My question is how to write an application that is a normal C++ program, but uses CUDA if it is possible.

All tutorials and info are using .cu files. But I am just using lets say msvcc and I place my code in *.h and *.cpp files.

Let's say I have code such as this:

int CountSomething(int *arg1, int *arg2, int *result)
{
  if (HasCUDA()) return CudaFunction(arg1, arg2, result);
  else           return CPUFunction(arg1, arg2, result);
}


What should the project look like? Should I make a header and a .cu file for the CUDA functions, and include the header in my host code?

Can I call CUDA function from a normal *.h or *.cpp file?

Should I make *.dll’s with CUDA code, and then call it from the host code?

What is the proper/best way of solving this?

Thanks for any help,

Best Regards

like that

#include <stdio.h>  

 #include <cuda.h>  

 #include <time.h>

#include <math.h>

 #include "cutil_inline.h"

 // Kernel that executes on the CUDA device  

 static unsigned __int64 start_tics;

 __global__ void square_array(float *a, int N,int in,int ss)  


    float  Cnter = 0.0;

    int z =0;

           int idx = blockIdx.x * blockDim.x + threadIdx.x;  

           int idy = blockIdx.y * blockDim.y + threadIdx.y;  

           int id = idx+65536*idy;;

          if (  id<N)



                   Cnter = a[z] + a[z+in];  

                   a[z+in] = a[z] - a[z+in];

                   a[z] = Cnter;



// Main routine that executes on the host.
//
// For S = 1..26: allocates an N-element host array, uploads it to the
// device, launches `square_array` S times (comparing two launch
// configurations), timing each launch with CUDA events and the whole
// round-trip with a cutil timer, then downloads and prints the result.
//
// NOTE(review): the original paste lost all braces; this reconstruction
// restores them and fixes the defects called out inline. N starts at 1
// and is never changed in the original — presumably it was meant to
// grow with S; left as-is to preserve the visible behavior. TODO confirm.
int main(void)
{
    float *memoirecpu1, *memoiregraphique1;  // pointers to host & device arrays
    cudaEvent_t start, stop;
    int N = 1;

    for (int S = 1; S < 27; S = S + 1)
    {
        size_t size = N * sizeof(float);

        // BUG FIX: the original malloc'd every iteration but freed only
        // once at the very end, leaking 25 of the 26 allocations. The
        // free now happens at the bottom of this loop body.
        memoirecpu1 = (float *)malloc(size);  // allocate array on host
        if (memoirecpu1 == NULL)
        {
            fprintf(stderr, "host allocation failed\n");
            return 1;
        }

        // Initialize host array.
        for (int i = 0; i < N; i++)
            memoirecpu1[i] = (float)i;

        cutilSafeCall(cudaEventCreate(&start));
        cutilSafeCall(cudaEventCreate(&stop));

        unsigned int timer = 0;
        cutilCheckError(cutCreateTimer(&timer));
        cutilCheckError(cutResetTimer(timer));
        cutilSafeCall(cudaThreadSynchronize());
        float gpu_time = 0.0f;
        cutilCheckError(cutStartTimer(timer));

        cudaMalloc((void **)&memoiregraphique1, size);  // allocate array on device
        cudaMemcpy(memoiregraphique1, memoirecpu1, size, cudaMemcpyHostToDevice);

        int in = 1;
        int ss = 2;

        for (int bou = 0; bou < S; bou = bou + 1)
        {
            cudaEventRecord(start, 0);

            // Two launch configurations were being compared for speed
            // (per the original author's timing notes).
            if (bou < 4)
                square_array<<<dim3(4096, 34, 1), dim3(16, 32, 1)>>>(memoiregraphique1, N, in, ss);
            else
                square_array<<<dim3(1024, 66, 1), dim3(64, 8, 1)>>>(memoiregraphique1, N, in, ss);

            cudaEventRecord(stop, 0);

            // BUG FIX: the busy-wait loop had an empty body, so `counter`
            // stayed 0 despite being printed as "iterations while waiting".
            unsigned long int counter = 0;
            while (cudaEventQuery(stop) == cudaErrorNotReady)
                ++counter;

            cutilSafeCall(cudaEventElapsedTime(&gpu_time, start, stop));
            printf("time spent executing by the GPU: %.2f\n", gpu_time);
        }

        cudaMemcpy(memoirecpu1, memoiregraphique1, sizeof(float) * N, cudaMemcpyDeviceToHost);

        cutilCheckError(cutStopTimer(timer));

        // Have CPU do some work while waiting for the GPU to finish.
        // BUG FIX: the original redeclared `counter` in the same scope
        // (a compile error); this one lives in its own scope now.
        unsigned long int counter = 0;
        while (cudaEventQuery(stop) == cudaErrorNotReady)
            ++counter;

        cutilSafeCall(cudaEventElapsedTime(&gpu_time, start, stop));

        // Print the CPU and GPU times.
        printf("time spent executing by the GPU: %.2f\n", gpu_time);
        printf("time spent by CPU in CUDA calls: %.2f\n", cutGetTimerValue(timer));
        // BUG FIX: %d for an unsigned long is a format mismatch; use %lu.
        printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

        // BUG FIX: the original printed 20 elements unconditionally,
        // reading past the end of the array whenever N < 20.
        for (int i = 0; i < 20 && i < N; i = i + 1)
            printf("%d %f\n", i, memoirecpu1[i]);

        free(memoirecpu1);
        cudaFree(memoiregraphique1);
        // BUG FIX: events and the cutil timer were never destroyed.
        cutilSafeCall(cudaEventDestroy(start));
        cutilSafeCall(cudaEventDestroy(stop));
        cutilCheckError(cutDeleteTimer(timer));
    }

    return 0;
}

The easiest way is to have your CUDA code in your .cu file(s) and call it from a .cpp file. It's not uncommon, when using the runtime API, to have wrapper functions for your .cu functions in a .cpp file and have those serve as the entry point to your CUDA function(s). I would advise against making CUDA DLLs and calling them from host code, unless you have no other alternative.

Thanks, using cu files with c++ wrappers for kernel functions works just fine :w00twave:

I only wonder why I cannot rate +1 your answers, because I get error message :down: