Does workload have an effect on speed of CUDA ?

================== de.cu ===========================

//=========================================================

//   Main entrance of the standard DE

//=========================================================

__constant__ int NP_CUDA = NP;	   // NP = 100  : Population size or the no. of individuals

 __constant__ int maxnfc_CUDA = 10 * 10000;

 __constant__ int maxrun_CUDA = MAXRUN;		// MAXRUN = 100;

 __constant__ int D_CUDA = DIM;	  // DIM = 101

 __constant__ double F_CUDA = 0.5;

 __constant__ double CR_CUDA = 0.9;

__device__ long		lseed = -1234;

__device__ int		gen = 0;			 //Generation

__device__ int		jrand;

__device__ int		imin = 0;		 // index to member with the best cost (obj. func. val.)

__device__ double	trial_cost;		// f. of the trial vector

__device__ double	mincost = 0;   // obj. func. val. of the best

__device__ double	cost[MAXPOP]; // objective function values (f.)

__device__ double	best[MAXDIM], bestit[MAXDIM];

__device__ double	*pswap_cu;

__device__ double	*x_d;

__device__ double	*curpop_cu[MAXPOP];

__device__ double	*nxtpop_cu[MAXPOP];

__device__ double	oldrand[55];

__device__ void reset_de(){

	int n, d;

	

	// Initialize population

	for ( n = 0; n < NP_CUDA; n++ ) {

	   for ( d = 0; d < D_CUDA; d++ ){

			curpop_cu[n][d] = randreal(low[d], hi[d]);		

	  }

		cost[n] = DEevaluate(curpop_cu[n]);   // Calculate Cost value

	}

	// Find the best individual at startup

	imin = 0;

	mincost = cost[0];

	for ( n = 1; n < NP_CUDA; n++ )  

	   if ( cost[n] < mincost ) {

		   mincost = cost[n];

		   imin = n;

	   }

	  

	dcopy_CUDA(D_CUDA, best,   curpop_cu[imin]);  // save best member ever

	dcopy_CUDA(D_CUDA, bestit, curpop_cu[imin]);  // save best member of generation

	  

}

__device__ void dcopy_CUDA (int D, double dest[], double source[]){

	int j;

	for (j=0; j<D; j++)

	  dest[j] = source[j];

}

__device__ void differential_mutation(int target)

{

	int n, L;

	int r1, r2, r3;	  // สุ่ม 3 vector

	

	do r1 = (int)(unirandom()*NP_CUDA); while (r1==target);  

	do r2 = (int)(unirandom()*NP_CUDA); while (r2==target || r2==r1); 

	do r3 = (int)(unirandom()*NP_CUDA); while (r3==target || r3==r1 || r3==r2);

	/*-------DE/rand/1/bin--------------------------------*/

	

		dcopy_CUDA(D_CUDA, x_d, curpop_cu[target]);

		

		n = (int)( unirandom() * D_CUDA );

		for ( L = 0; L < D_CUDA; L++ ) { /* perform D binomial trials */

			if ( unirandom() < CR_CUDA || 

				L == D_CUDA-1 ) /* change at least 1 parameter */

				x_d[n] = curpop_cu[r1][n] + F_CUDA*( curpop_cu[r2][n] - curpop_cu[r3][n] );

			n = ( n + 1 ) % D_CUDA;

		}

	  

		

}

__device__ double Sphere(double x[])

{

	int i;

	double res = 0.0;

	for ( i = 0; i < D_CUDA; i++ )

		res += x[i]*x[i];

	return res;

}

__device__ double DEevaluate(double x[])

{

	double ret = Sphere(x);

	

	nfc++;

	

	if ( nfc % logstep2file == 0 || nfc == 500 ) {

		lognfc[logc]  = nfc;

		logval[logc] += mincost;

		logc++;

	}

	// Onscreen logging

	

	if ( nfc == maxnfc_CUDA) {

		

		printf("%s\t%s\tR%02d\t%d\t%d\t%10.04g\n", 

			"RAND1B","F1-Sphere", nrun, gen, nfc, mincost);

		fflush(stdout);

	}

	return ret;

}

__global__ void deCUDA(double* mincost_CU)

{

	double temp[MAXRUN];

	int	target, d, i, m=0,n; 

	

	int tx = threadIdx.x;

	int ty = threadIdx.y;

	int index = (ty*16+tx) + ((blockIdx.x*64)+(blockIdx.y*256));

	

	if( index < 100){

		printf("thread ID = %d\n ",index);

	for(n = 0; n < NP_CUDA; n++){

		curpop_cu[n] = (double *)calloc(D_CUDA, sizeof(double));

		nxtpop_cu[n] = (double *)calloc(D_CUDA, sizeof(double));

	}

	randomize();

	x_d = (double *)calloc(D_CUDA, sizeof(double));

	

	reset_de();

	nfc = 0;

	logc = 0;

	

for ( gen = 1; nfc <= maxnfc_CUDA; gen++ ) {	 

		imin = 0;

	   

		for ( target = 0; target < NP_CUDA; target++ ) {	 

	differential_mutation(target); 

			// Trial mutation now in x[]. Check its boundaries

			for ( d = 0; d < D_CUDA; d++) {

				if ( x_d[d] < low[d] )

					x_d[d] = randreal ( low[d], 0.005*(hi[d]-low[d]) );

				else if ( x_d[d] > hi[d] ) {

					double space = 0.005 * ( hi[d] - low[d] );

					x_d[d] = randreal( hi[d] - space, space );

				}

			}

					

			

			trial_cost = DEevaluate(x_d);

				

										

			if (trial_cost <= cost[target]) { // Is objective function improved?

				cost[target] = trial_cost;

			   	

				dcopy_CUDA(D_CUDA, nxtpop_cu[target], x_d);

				if ( trial_cost < mincost ) { // Was this a new minimum?

					mincost = trial_cost;	 // If so, set mincost to new low.

					imin = target;

					dcopy_CUDA(D_CUDA, best, x_d);				   

				}

			} else { 

			  dcopy_CUDA(D_CUDA, nxtpop_cu[target], curpop_cu[target]);

				

			}			

		 }

		dcopy_CUDA(D_CUDA, bestit, best); // Save best member of the current gen.

		for ( i = 0; i < NP_CUDA; i++ ) {

			pswap_cu = curpop_cu[i];

			curpop_cu[i] = nxtpop_cu[i];

			nxtpop_cu[i] = pswap_cu; // swap pop. for next gen.

		}

		if(nfc == maxnfc_CUDA){

			temp[m] = mincost;

			m++;

		}		

	}

	nrun++;

	putchar('\n');

	

	for(int f=0; f<maxrun_CUDA; f++){

		mincost_CU[f] = temp[f];

	}

	}

}

void dcopy (int D, double dest[], double source[]){

	register int j;

	for (j=0; j<D; j++)

	  dest[j] = source[j];

}

/*------Constants for uniform_random()--------------------------------------------*/

#define IM1 2147483563

#define IM2 2147483399

#define AM (1.0/IM1)

#define IMM1 (IM1-1)

#define IA1 40014

#define IA2 40692

#define IQ1 53668

#define IQ2 52774

#define IR1 12211

#define IR2 3791

#define NTAB 32

#define NDIV (1+IMM1/NTAB)

#define EPS 1.0e-14

#define RNMX (1.0-EPS)

	long idum2 = 123456789;

	long iy = 0;

	long iv[NTAB];

__device__ double uniform_random(long *idum)

{

	long j, k;

	double tempCR;

	if (*idum <= 0) {

		if (-(*idum) < 1) *idum=1;

		else *idum = -(*idum);

		idum2=(*idum);

		for (j=NTAB+7; j>=0; j--) {

			k=(*idum)/IQ1;

			*idum=IA1*(*idum-k*IQ1)-k*IR1;

			if (*idum < 0) *idum += IM1;

			if (j < NTAB) iv[j] = *idum;

		}

		iy=iv[0];

	}

	k=(*idum)/IQ1;

	*idum=IA1*(*idum-k*IQ1)-k*IR1;

	if (*idum < 0) *idum += IM1;

	k=idum2/IQ2;

	idum2=IA2*(idum2-k*IQ2)-k*IR2;

	if (idum2 < 0) idum2 += IM2;

	j=iy/NDIV;

	iy=iv[j]-idum2;

	iv[j] = *idum;

	if (iy < 1) iy += IMM1;

	if ((tempCR=AM*iy) > RNMX) return RNMX; else return tempCR;

}

/* Variable declarations for the random number generator */

double seed;

int	rndcalcflag;

double rndx1, rndx2;

/* Get seed number for random and start it up */

__device__ void randomize()

{

	int j1;

	for ( j1 = 0; j1 <= 54; j1++ )

		oldrand[j1] = 0.0;

	jrand = 0;

	warmup_random(seed);

	return;

}

/* Get randomize off and running */

__device__ void warmup_random (double seed)

{

	int j1, ii;

	double new_random, prev_random;

	oldrand[54] = seed;

	new_random = 0.000000001;

	prev_random = seed;

	for( j1 = 1; j1 <= 54; j1++ ) {

		ii = (21*j1)%54;

		oldrand[ii] = new_random;

		new_random = prev_random-new_random;

		if(new_random<0.0)

			new_random += 1.0;

		prev_random = oldrand[ii];

	}

	advance_random ();

	advance_random ();

	advance_random ();

	jrand = 0;

	return;

}

/* Create next batch of 55 random numbers */

__device__ void advance_random ()

{

	int j1;

	double new_random;

	for(j1=0; j1<24; j1++)

	{

		new_random = oldrand[j1]-oldrand[j1+31];

		if(new_random<0.0)

			new_random = new_random+1.0;

		oldrand[j1] = new_random;

	}

	for(j1=24; j1<55; j1++)

	{

		new_random = oldrand[j1]-oldrand[j1-24];

		if(new_random<0.0)

			new_random = new_random+1.0;

		oldrand[j1] = new_random;

	}

}

/* Fetch a single random number between 0.0 and 1.0 */

__device__ double randomperc()

{

	jrand++;

	if ( jrand >= 55 ) {

		jrand = 1;

		advance_random();

	}

	return (double)oldrand[jrand];

}

/* Fetch a single random integer between low and high including the bounds */

__device__ int randint(int low, int high)

{

	int res;

	if (low >= high)

		res = low;

	else {

		res = low + (int)(randomperc()*(high-low+1));

		if (res > high) res = high;

	}

	return res;

}

/* Fetch a single random real number between low and high including the bounds */

__device__ double randreal(double low, double high)

{

	return (low + (high-low)*randomperc());

}

/* initialize_pso the randome generator for normal distribution */

__device__ void initrandomnormaldeviate()

{

	rndcalcflag = 1;

	return;

}

/* Return the noise value */

__device__ double noise(double mu, double sigma)

{

	return (randomnormaldeviate()*sigma) + mu;

}

/* Compute the noise */

__device__ double randomnormaldeviate()

{

	double t;

	if(rndcalcflag) {

		rndx1 = sqrt(- 2.0*log(randomperc()));

		t = 6.2831853072*randomperc();

		rndx2 = sin(t);

		rndcalcflag = 0;

		return(rndx1*cos(t));

	} else {

		rndcalcflag = 1;

		return(rndx1*rndx2);

	}

}

================== main.cu ===========================

double *cuda(){

		double  *answer;

	dim3	threadsPerBlock(16,4);

	dim3	threadsPerGrid(4,4);

	answer = (double*)malloc(D*np*sizeof(double));

	cudaMalloc((void**)&mincost_p, D*np*sizeof(double));

	deCUDA<<<threadsPerGrid, threadsPerBlock>>>(mincost_p);

	cudaMemcpy(answer, mincost_p,size,cudaMemcpyDeviceToHost);

	return answer;

}

This is my code for CUDA programming. In main.cu calls cuda() function and it’ll return result to show in console.

Now, in the code above (de.cu),I must run deCUDA up to 100 rounds or more (now I defined 100 rounds) I think that I created threads for 1024 threads to work in deCUDA but I used only 100 threads to work it as shown in “index < 100”.

When I test this program with CUDA (with GF 9800GT 1G) , it still slower than CPU (Core i7 920 2.66 GHz) , I think that it slower than CPU because the workload is not enough (Small amount of workload) . Does it faster than CPU when I increase workload and How much workload should I increase for this program ?

Or it has the other way to make CUDA run faster than CPU.

If you have more time to unserstand DE algorithm, I bring the DE algorithm to show the main concept of DE.


Your average GPU will barely get out of bed to run a thousand threads. You either need to increase your problem size, or divide up your problem more finely.