Slow CUDA computation time (need ideas for my CUDA code)

Hello, I am a beginner in CUDA programming. This is my code.

#include "book.h"
#include <conio.h>
#include <iostream>   // header name was stripped when the code was posted; <iostream> assumed
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
// (an #include whose header name was stripped when the code was posted)
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#define pop 1000
#define gen 1000
#define p (gen*pop)

using namespace std;

// random number generator on the GPU
__device__ float generate( curandState *globalState, int i)
{
//int ind = threadIdx.x;
curandState localState = globalState[i];
float RANDOM = curand_uniform( &localState );
globalState[i] = localState;
return RANDOM;
}

__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = (blockIdx.x * blockDim.x) + threadIdx.x; // global index; threadIdx.x alone is always 0 with 1 thread per block
curand_init ( seed, id, 0, &state[id] );
}
__global__ void crossover(int *child, int *parent, curandState *globalState, int *x, int *y)
{
int tid=(blockIdx.x*blockDim.x)+threadIdx.x;
int distance1,distance2;
int tmp;
if(tid<pop-1)
{

    for(int j=0;j<gen;j++)
    {

        if (j == 0)
        {
            child[(tid*gen)+j]= generate(globalState,tid) * gen; // note: the result here is not well randomized
        }
        else
        {

            distance1=sqrt((float)( pow ((double)(x[parent[(tid*gen)+j]]-x[child[(tid*gen)+(j-1)]]),2.0)+pow((double)(y[parent[(tid*gen)+j]]-y[child[(tid*gen)+(j-1)]]),2.0)));
            distance2=sqrt((float)( pow ((double)(x[parent[((tid+1)*gen)+j]]-x[child[(tid*gen)+(j-1)]]),2.0)+pow((double)(y[parent[((tid+1)*gen)+j]]-y[child[(tid*gen)+(j-1)]]),2.0)));             
            //child[(tid*gen)+j]=tid; // thread on which this computation runs
            if (distance1<distance2)
            {
                tmp=parent[(tid*gen)+j];
            }
            else
            {
                tmp=parent[((tid+1)*gen)+j];
            }

            bool same;
            do
            {
                same=false;
                for(int i=0;i<j;i++)
                {
                    if (child[(tid*gen)+i] == tmp)
                    {
                        same=true;
                        tmp=generate(globalState, tid)*gen ;
                    }
                }
                child[(tid*gen)+j]=tmp;
            }while(same);

        }
    }
}
else
{
    for(int j=0;j<gen;j++)
    {
        if (j == 0)
        {
            child[(tid*gen)+j]= generate(globalState,tid) * gen;
        }
        else
        {
            distance1=sqrt((float)(pow((double)(x[parent[(tid*gen)+j]]-x[child[(tid*gen)+(j-1)]]),2.0))+pow((double)(y[parent[(tid*gen)+j]]-y[child[(tid*gen)+(j-1)]]),2.0));
            distance2=sqrt((float)(pow((double)(x[parent[j]]-x[child[(tid*gen)+(j-1)]]),2.0))+pow((double)(y[parent[j]]-y[child[(tid*gen)+(j-1)]]),2.0));

            if (distance1<distance2)
            {
                tmp=parent[(tid*gen)+j];
            }
            else
            {
                tmp=parent[j];
            }

            bool samme;
            do
            {
                samme=false;
                for(int i=0;i<j;i++)
                {
                    if (child[(tid*gen)+i] == tmp)
                    {
                        samme=true;
                        tmp=generate(globalState, tid)*gen ;
                    }
                }
                child[(tid*gen)+j]=tmp;
            }while(samme);

        }
    }

}

}
int main (void)
{
srand(time(NULL));
float time1;
int x[gen],y[gen];
int *dev_x, *dev_y;
double MAX = 100.;
int *parent = new int[p];
int *child = new int[p];
int *dev_parent;
int *dev_child;
for (int i=0;i<gen;i++)
{
float unirand = ((float)rand()/(float)RAND_MAX);
x[i] = MAX * unirand;
}
for (int i=0;i<gen;i++)
{
float unirand = ((float)rand()/(float)RAND_MAX);
y[i] = MAX * unirand;
}
HANDLE_ERROR( cudaMalloc( (void**)&dev_x, gen * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_y, gen * sizeof(int) ) );
HANDLE_ERROR( cudaMemcpy( dev_x, x, gen * sizeof(int), cudaMemcpyHostToDevice ) );
HANDLE_ERROR( cudaMemcpy( dev_y, y, gen * sizeof(int), cudaMemcpyHostToDevice ) );
bool check;
for (int i=0;i<pop;i++)
{
for (int j=0;j<gen;j++)
{
do
{
check = false;
parent[ ((i*gen)+j) ] = rand()%gen;
for (int k=0;k<j;k++)
{
if (parent[ ((i*gen)+j) ] == parent[ ((i*gen)+k) ])
{
check = true;
break;
}
}

        }
        while (check);

    }
}
HANDLE_ERROR( cudaMalloc( (void**)&dev_parent, p * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_child, p * sizeof(int) ) );
HANDLE_ERROR( cudaMemcpy (dev_parent, parent, p * sizeof(int), cudaMemcpyHostToDevice ) );

curandState* devStates;
HANDLE_ERROR(cudaMalloc ( &devStates, pop * sizeof( curandState ) ));
cudaEvent_t start, stop;
HANDLE_ERROR( cudaEventCreate(&start) );
HANDLE_ERROR( cudaEventCreate(&stop) );
HANDLE_ERROR( cudaEventRecord(start, 0) );
setup_kernel <<< pop, 1 >>> ( devStates, unsigned(time(NULL)) );
crossover<<<pop, 1>>>(dev_child,dev_parent,devStates,dev_x,dev_y);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaEventRecord(stop, 0) );
HANDLE_ERROR( cudaEventSynchronize(stop) );
HANDLE_ERROR( cudaEventElapsedTime(&time1, start, stop) );
HANDLE_ERROR( cudaMemcpy ( child, dev_child, p * sizeof(int),cudaMemcpyDeviceToHost ) );
printf("crossover:  %3.1f ms \n", time1);
cudaFree( dev_child );
cudaFree( dev_parent );
cudaFree( devStates );
cudaFree( dev_x );
cudaFree( dev_y );
system("pause");
return 0;

}

The execution time is 17943 ms in CUDA. Please help me make it faster. Also, sometimes when gen > 500 and pop > 500 I get an error at the cudaMemcpyDeviceToHost line. Thank you.

I think your performance problem is that you launch your kernels with bad parameters.

For example:

setup_kernel <<< pop, 1 >>> ( devStates, unsigned(time(NULL)) );

you launch only 1 thread per block.

Blocks are scheduled onto the GPU's multiprocessors; by launching only 1 thread per block you leave most of their capacity unused, because each block can hold many more threads, depending on your GPU.

Try using parameters like this:

dim3 nbThreads(128);
dim3 nbBlocks((pop / nbThreads.x) + 1);
setup_kernel <<< nbBlocks, nbThreads >>> ( devStates, unsigned(time(NULL)) );

You will launch blocks of 128 threads (loading and storing values in a coalesced manner).
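
One caveat with this configuration: (pop / nbThreads.x) + 1 blocks give you more threads than pop, and devStates only holds pop states, so each kernel should compute its global index and return early when it is out of range. A minimal sketch of that guard for setup_kernel, assuming the launch parameters above:

__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;   // global thread index
    if (id >= pop) return;                              // surplus threads in the last block do nothing
    curand_init ( seed, id, 0, &state[id] );
}

The same two lines (compute tid, then return when tid >= pop) belong at the top of crossover() if you launch it with <<< nbBlocks, nbThreads >>> as well.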

P.S. See the NVIDIA CUDA Programming Guide to understand how the block/thread organisation works; it is well documented.