#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <math.h>
#include "cutil_inline.h"
// File-scope 64-bit unsigned counter; judging by the name it was presumably
// meant to hold a timer tick value. Nothing in the visible code reads or
// writes it.
// NOTE(review): `__int64` is a Microsoft-specific type name, so this line
// presumably only builds with MSVC as the host compiler - confirm.
static unsigned __int64 start_tics;
// Kernel that executes on the CUDA device
// square_array: one in-place sum/difference step per thread.
//
// Each thread with linear id `id < N` picks the element pair (z, z + in),
// where z = (id / in) * in + id, and overwrites it with
//     a[z]      = a[z] + a[z + in]
//     a[z + in] = a[z] - a[z + in]      (using the values read before writing)
//
// Parameters:
//   a  - device array, updated in place. The largest index touched is
//        z + in for id = N - 1, so the caller must size `a` accordingly.
//   N  - number of pairs to process (threads with id >= N do nothing).
//   in - distance between the two elements of a pair; must be > 0.
//   ss - unused; kept so existing launches keep compiling.
//
// Launch layout: a 2D grid of 2D blocks. The linear id is
// x + (gridDim.x * blockDim.x) * y, so any launch shape with at least N
// threads in total covers every pair exactly once. The total thread count
// and every index computed here must fit in a signed int.
__global__ void square_array(float *a, int N,int in,int ss)
{
    // Row width comes from the launch configuration instead of a fixed 65536,
    // so the flattening stays correct for any grid/block shape.
    const int rowWidth = gridDim.x * blockDim.x;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int idy = blockIdx.y * blockDim.y + threadIdx.y;
    const int id = idx + rowWidth * idy;

    if (id < N)
    {
        // (id / in) * in skips over the upper halves of all preceding groups
        // of 2 * in elements, so z always lands in a lower half.
        const int z = (id / in) * in + id;

        // Load both operands once, then write sum and difference back.
        const float lower = a[z];
        const float upper = a[z + in];
        a[z] = lower + upper;
        a[z + in] = lower - upper;
    }
}
// main routine that executes on the host
// Host driver.
//
// Fills a 2^26-element float array with 0, 1, 2, ..., uploads it, and runs 26
// square_array stages over it, doubling the pair distance `in` after every
// stage (1, 2, 4, ..., 2^25). Each stage is timed with CUDA events while the
// CPU spins on cudaEventQuery. Afterwards the whole array is copied back and
// the first 20 values are printed together with the timing summary.
//
// Returns 0 on success, 1 if the host buffer cannot be allocated. CUDA and
// cutil failures are reported through cutilSafeCall / cutilCheckError.
int main(void)
{
    const int S = 26;                    // number of stages = log2(element count)
    const int total = 1 << S;            // element count
    const size_t size = (size_t)total * sizeof(float);

    float *h_data = (float *)malloc(size);   // host copy of the array
    float *d_data = NULL;                     // device copy of the array
    if (h_data == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
        return 1;
    }

    // Initialize host array
    for (int i = 0; i < total; i++)
    {
        h_data[i] = (float)i;
    }

    cudaEvent_t start, stop;
    cutilSafeCall( cudaEventCreate(&start) );
    cutilSafeCall( cudaEventCreate(&stop) );
    unsigned int timer;
    cutilCheckError( cutCreateTimer(&timer) );
    cutilCheckError( cutResetTimer(timer) );
    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStartTimer(timer) );
    //---------------------------
    // The CPU timer covers device allocation, both copies and all stages.
    cutilSafeCall( cudaMalloc((void **) &d_data, size) );
    cutilSafeCall( cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice) );

    const int pairs = total / 2;         // each thread handles one pair
    int in = 1;
    int ss = 2;
    float gpu_time = 0.0f;               // elapsed time of the current stage
    float gpu_total = 0.0f;              // sum of all stage times
    unsigned long int counter = 0;       // CPU spin iterations across all stages
    for (int bou = 0; bou < S; bou = bou + 1)
    {
        cutilSafeCall( cudaEventRecord(start, 0) );
        // Both launch shapes are 65536 threads wide in x. They are kept
        // exactly as the original author tuned them.
        if (bou < 4)
        {
            // original author's timing note: "i win 10 ms now only 95 ms"
            square_array <<< dim3(4096,34,1),dim3(16,32,1) >>> (d_data, pairs, in, ss);
        }
        else
        {
            // original author's timing note: "only 14 ms"
            square_array <<< dim3(1024,66,1),dim3(64,8,1) >>> (d_data, pairs, in, ss);
        }
        // A bad launch configuration is only visible through the last-error state.
        cutilSafeCall( cudaGetLastError() );
        cutilSafeCall( cudaEventRecord(stop, 0) );

        // have CPU do some work while waiting for the stage to finish
        while( cudaEventQuery(stop) == cudaErrorNotReady )
        {
            counter++;
        }
        cutilSafeCall( cudaEventElapsedTime(&gpu_time, start, stop) );
        printf("time spent executing by the GPU: %.2f\n", gpu_time);
        gpu_total += gpu_time;

        in = in * 2;
        ss = ss * 2;
    }

    // Copy back the WHOLE array: `size` bytes, not just `pairs` floats.
    cutilSafeCall( cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost) );
    //------------------------------
    cutilCheckError( cutStopTimer(timer) );

    // print the cpu and gpu times; the GPU figure is the sum over all stages so
    // that it is comparable with the CPU timer, which spans the same region
    printf("time spent executing by the GPU: %.2f\n", gpu_total);
    printf("time spent by CPU in CUDA calls: %.2f\n", cutGetTimerValue(timer) );
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
    for (int i = 0; i < 20; i = i + 1)
    {
        printf("%d %f\n", i, h_data[i]);
    }

    // Release everything that was created above.
    cutilCheckError( cutDeleteTimer(timer) );
    cutilSafeCall( cudaEventDestroy(start) );
    cutilSafeCall( cudaEventDestroy(stop) );
    cutilSafeCall( cudaFree(d_data) );
    free(h_data);
    return 0;
}
The easiest way is to keep your CUDA code in your .cu file(s) and call it from a .cpp file. When using the runtime API, it is not uncommon to have wrapper functions for your .cu functions in a .cpp file and have those serve as the entry points to your CUDA function(s). I would advise against building CUDA DLLs and calling them from host code unless you have no other alternative.