================== de.cu ===========================
//=========================================================
// Main entrance of the standard DE
//=========================================================
__constant__ int NP_CUDA = NP; // NP = 100 : Population size or the no. of individuals
__constant__ int maxnfc_CUDA = 10 * 10000;
__constant__ int maxrun_CUDA = MAXRUN; // MAXRUN = 100;
__constant__ int D_CUDA = DIM; // DIM = 101
__constant__ double F_CUDA = 0.5;
__constant__ double CR_CUDA = 0.9;
__device__ long lseed = -1234;
__device__ int gen = 0; //Generation
__device__ int jrand;
__device__ int imin = 0; // index to member with the best cost (obj. func. val.)
__device__ double trial_cost; // f. of the trial vector
__device__ double mincost = 0; // obj. func. val. of the best
__device__ double cost[MAXPOP]; // objective function values (f.)
__device__ double best[MAXDIM], bestit[MAXDIM];
__device__ double *pswap_cu;
__device__ double *x_d;
__device__ double *curpop_cu[MAXPOP];
__device__ double *nxtpop_cu[MAXPOP];
__device__ double oldrand[55];
__device__ void reset_de(){
int n, d;
// Initialize population
for ( n = 0; n < NP_CUDA; n++ ) {
for ( d = 0; d < D_CUDA; d++ ){
curpop_cu[n][d] = randreal(low[d], hi[d]);
}
cost[n] = DEevaluate(curpop_cu[n]); // Calculate Cost value
}
// Find the best individual at startup
imin = 0;
mincost = cost[0];
for ( n = 1; n < NP_CUDA; n++ )
if ( cost[n] < mincost ) {
mincost = cost[n];
imin = n;
}
dcopy_CUDA(D_CUDA, best, curpop_cu[imin]); // save best member ever
dcopy_CUDA(D_CUDA, bestit, curpop_cu[imin]); // save best member of generation
}
__device__ void dcopy_CUDA (int D, double dest[], double source[]){
int j;
for (j=0; j<D; j++)
dest[j] = source[j];
}
__device__ void differential_mutation(int target)
{
int n, L;
int r1, r2, r3; // สุ่ม 3 vector
do r1 = (int)(unirandom()*NP_CUDA); while (r1==target);
do r2 = (int)(unirandom()*NP_CUDA); while (r2==target || r2==r1);
do r3 = (int)(unirandom()*NP_CUDA); while (r3==target || r3==r1 || r3==r2);
/*-------DE/rand/1/bin--------------------------------*/
dcopy_CUDA(D_CUDA, x_d, curpop_cu[target]);
n = (int)( unirandom() * D_CUDA );
for ( L = 0; L < D_CUDA; L++ ) { /* perform D binomial trials */
if ( unirandom() < CR_CUDA ||
L == D_CUDA-1 ) /* change at least 1 parameter */
x_d[n] = curpop_cu[r1][n] + F_CUDA*( curpop_cu[r2][n] - curpop_cu[r3][n] );
n = ( n + 1 ) % D_CUDA;
}
}
__device__ double Sphere(double x[])
{
int i;
double res = 0.0;
for ( i = 0; i < D_CUDA; i++ )
res += x[i]*x[i];
return res;
}
__device__ double DEevaluate(double x[])
{
double ret = Sphere(x);
nfc++;
if ( nfc % logstep2file == 0 || nfc == 500 ) {
lognfc[logc] = nfc;
logval[logc] += mincost;
logc++;
}
// Onscreen logging
if ( nfc == maxnfc_CUDA) {
printf("%s\t%s\tR%02d\t%d\t%d\t%10.04g\n",
"RAND1B","F1-Sphere", nrun, gen, nfc, mincost);
fflush(stdout);
}
return ret;
}
__global__ void deCUDA(double* mincost_CU)
{
double temp[MAXRUN];
int target, d, i, m=0,n;
int tx = threadIdx.x;
int ty = threadIdx.y;
int index = (ty*16+tx) + ((blockIdx.x*64)+(blockIdx.y*256));
if( index < 100){
printf("thread ID = %d\n ",index);
for(n = 0; n < NP_CUDA; n++){
curpop_cu[n] = (double *)calloc(D_CUDA, sizeof(double));
nxtpop_cu[n] = (double *)calloc(D_CUDA, sizeof(double));
}
randomize();
x_d = (double *)calloc(D_CUDA, sizeof(double));
reset_de();
nfc = 0;
logc = 0;
for ( gen = 1; nfc <= maxnfc_CUDA; gen++ ) {
imin = 0;
for ( target = 0; target < NP_CUDA; target++ ) {
differential_mutation(target);
// Trial mutation now in x[]. Check its boundaries
for ( d = 0; d < D_CUDA; d++) {
if ( x_d[d] < low[d] )
x_d[d] = randreal ( low[d], 0.005*(hi[d]-low[d]) );
else if ( x_d[d] > hi[d] ) {
double space = 0.005 * ( hi[d] - low[d] );
x_d[d] = randreal( hi[d] - space, space );
}
}
trial_cost = DEevaluate(x_d);
if (trial_cost <= cost[target]) { // Is objective function improved?
cost[target] = trial_cost;
dcopy_CUDA(D_CUDA, nxtpop_cu[target], x_d);
if ( trial_cost < mincost ) { // Was this a new minimum?
mincost = trial_cost; // If so, set mincost to new low.
imin = target;
dcopy_CUDA(D_CUDA, best, x_d);
}
} else {
dcopy_CUDA(D_CUDA, nxtpop_cu[target], curpop_cu[target]);
}
}
dcopy_CUDA(D_CUDA, bestit, best); // Save best member of the current gen.
for ( i = 0; i < NP_CUDA; i++ ) {
pswap_cu = curpop_cu[i];
curpop_cu[i] = nxtpop_cu[i];
nxtpop_cu[i] = pswap_cu; // swap pop. for next gen.
}
if(nfc == maxnfc_CUDA){
temp[m] = mincost;
m++;
}
}
nrun++;
putchar('\n');
for(int f=0; f<maxrun_CUDA; f++){
mincost_CU[f] = temp[f];
}
}
}
void dcopy (int D, double dest[], double source[]){
register int j;
for (j=0; j<D; j++)
dest[j] = source[j];
}
/*------Constants for uniform_random()--------------------------------------------*/
#define IM1 2147483563
#define IM2 2147483399
#define AM (1.0/IM1)
#define IMM1 (IM1-1)
#define IA1 40014
#define IA2 40692
#define IQ1 53668
#define IQ2 52774
#define IR1 12211
#define IR2 3791
#define NTAB 32
#define NDIV (1+IMM1/NTAB)
#define EPS 1.0e-14
#define RNMX (1.0-EPS)
long idum2 = 123456789;
long iy = 0;
long iv[NTAB];
__device__ double uniform_random(long *idum)
{
long j, k;
double tempCR;
if (*idum <= 0) {
if (-(*idum) < 1) *idum=1;
else *idum = -(*idum);
idum2=(*idum);
for (j=NTAB+7; j>=0; j--) {
k=(*idum)/IQ1;
*idum=IA1*(*idum-k*IQ1)-k*IR1;
if (*idum < 0) *idum += IM1;
if (j < NTAB) iv[j] = *idum;
}
iy=iv[0];
}
k=(*idum)/IQ1;
*idum=IA1*(*idum-k*IQ1)-k*IR1;
if (*idum < 0) *idum += IM1;
k=idum2/IQ2;
idum2=IA2*(idum2-k*IQ2)-k*IR2;
if (idum2 < 0) idum2 += IM2;
j=iy/NDIV;
iy=iv[j]-idum2;
iv[j] = *idum;
if (iy < 1) iy += IMM1;
if ((tempCR=AM*iy) > RNMX) return RNMX; else return tempCR;
}
/* Variable declarations for the random number generator */
double seed;
int rndcalcflag;
double rndx1, rndx2;
/* Get seed number for random and start it up */
__device__ void randomize()
{
int j1;
for ( j1 = 0; j1 <= 54; j1++ )
oldrand[j1] = 0.0;
jrand = 0;
warmup_random(seed);
return;
}
/* Get randomize off and running */
__device__ void warmup_random (double seed)
{
int j1, ii;
double new_random, prev_random;
oldrand[54] = seed;
new_random = 0.000000001;
prev_random = seed;
for( j1 = 1; j1 <= 54; j1++ ) {
ii = (21*j1)%54;
oldrand[ii] = new_random;
new_random = prev_random-new_random;
if(new_random<0.0)
new_random += 1.0;
prev_random = oldrand[ii];
}
advance_random ();
advance_random ();
advance_random ();
jrand = 0;
return;
}
/* Create next batch of 55 random numbers */
__device__ void advance_random ()
{
int j1;
double new_random;
for(j1=0; j1<24; j1++)
{
new_random = oldrand[j1]-oldrand[j1+31];
if(new_random<0.0)
new_random = new_random+1.0;
oldrand[j1] = new_random;
}
for(j1=24; j1<55; j1++)
{
new_random = oldrand[j1]-oldrand[j1-24];
if(new_random<0.0)
new_random = new_random+1.0;
oldrand[j1] = new_random;
}
}
/* Fetch a single random number between 0.0 and 1.0 */
__device__ double randomperc()
{
jrand++;
if ( jrand >= 55 ) {
jrand = 1;
advance_random();
}
return (double)oldrand[jrand];
}
/* Fetch a single random integer between low and high including the bounds */
__device__ int randint(int low, int high)
{
int res;
if (low >= high)
res = low;
else {
res = low + (int)(randomperc()*(high-low+1));
if (res > high) res = high;
}
return res;
}
/* Fetch a single random real number between low and high including the bounds */
__device__ double randreal(double low, double high)
{
return (low + (high-low)*randomperc());
}
/* initialize_pso the randome generator for normal distribution */
__device__ void initrandomnormaldeviate()
{
rndcalcflag = 1;
return;
}
/* Return the noise value */
__device__ double noise(double mu, double sigma)
{
return (randomnormaldeviate()*sigma) + mu;
}
/* Compute the noise */
__device__ double randomnormaldeviate()
{
double t;
if(rndcalcflag) {
rndx1 = sqrt(- 2.0*log(randomperc()));
t = 6.2831853072*randomperc();
rndx2 = sin(t);
rndcalcflag = 0;
return(rndx1*cos(t));
} else {
rndcalcflag = 1;
return(rndx1*rndx2);
}
}
================== main.cu ===========================
double *cuda(){
double *answer;
dim3 threadsPerBlock(16,4);
dim3 threadsPerGrid(4,4);
answer = (double*)malloc(D*np*sizeof(double));
cudaMalloc((void**)&mincost_p, D*np*sizeof(double));
deCUDA<<<threadsPerGrid, threadsPerBlock>>>(mincost_p);
cudaMemcpy(answer, mincost_p,size,cudaMemcpyDeviceToHost);
return answer;
}
This is my code for CUDA programming. In main.cu calls cuda() function and it’ll return result to show in console.
Now, in the code above (de.cu),I must run deCUDA up to 100 rounds or more (now I defined 100 rounds) I think that I created threads for 1024 threads to work in deCUDA but I used only 100 threads to work it as shown in “index < 100”.
When I test this program with CUDA (with GF 9800GT 1G) , it still slower than CPU (Core i7 920 2.66 GHz) , I think that it slower than CPU because the workload is not enough (Small amount of workload) . Does it faster than CPU when I increase workload and How much workload should I increase for this program ?
Or it has the other way to make CUDA run faster than CPU.
If you have more time to unserstand DE algorithm, I bring the DE algorithm to show the main concept of DE.