How to use a kernel with two for loops?

Hi. I would like to run Function_1 on CUDA, using the kernel below.
But when I run my kernel, I get nothing! How do I implement loop_1 and loop_2 in a kernel?
Please help me!
kernel:

# define F 20
// Computes dev_prob[i] = 0.9 * dev_fitness[i] / max(dev_fitness[0..F)) + 0.1
// for the first F entries.
//
// Launch layout: the element index is blockIdx.x alone, so launch with at
// least F blocks (for example kernel<<<F, 1>>>(dev_fitness, dev_prob)).
// Both pointers must refer to device arrays holding at least F doubles.
__global__ void kernel(double* dev_fitness,double* dev_prob)
{
    int i = blockIdx.x;
    if (i >= F)
        return;

    // Every block needs the maximum over ALL F entries, not just the larger of
    // dev_fitness[0] and dev_fitness[i]. Blocks cannot see each other's local
    // variables, so each one scans the whole array itself.
    double maxfit = dev_fitness[0];
    for (int k = 1; k < F; k++)
    {
        if (dev_fitness[k] > maxfit)
            maxfit = dev_fitness[k];
    }

    dev_prob[i] = (0.9 * (dev_fitness[i] / maxfit)) + 0.1;
}

//Function_1:
[code]void Function_1()
{
     // loop_1: find the largest value in fitness[0..F).
     double best = fitness[0];
     int k = 1;
     while (k < F)
     {
          best = (fitness[k] > best) ? fitness[k] : best;
          ++k;
     }

     // loop_2: rescale every fitness value against that maximum.
     for (k = 0; k < F; ++k)
     {
          prob[k] = (0.9 * (fitness[k] / best)) + 0.1;
     }
}

For the first loop you need a reduction code.

#include <cuda.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define tbp 512
// Block-wise minimum reduction.
//
// Each block loads up to tbp elements of a[0..nn) into shared memory, reduces
// them with a tree reduction, and thread 0 writes the block's minimum to
// d[blockIdx.x].
//
// Launch requirements: blockDim.x must equal tbp (a power of two) and the grid
// must contain at least ceil(nn / tbp) blocks; d must hold one int per block.
// a and d may be the same buffer only for a single-block launch: all reads of
// a happen before the first barrier and the only write to d happens after the
// last one.
__global__ void kernel_min(int *a, int *d,int nn)
{
    __shared__ int sdata[tbp]; // "static" shared memory, one slot per thread

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the input contribute INT_MAX, the identity
    // element for min, so they can never win the reduction.
    sdata[tid] = (i < nn) ? a[i] : INT_MAX;
    __syncthreads();

    // Halve the number of active threads each step; the barrier sits outside
    // the divergent branch so that every thread in the block reaches it.
    for (unsigned int s = tbp / 2; s > 0; s >>= 1)
    {
        if (tid < s && sdata[tid + s] < sdata[tid])
        {
            sdata[tid] = sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d[blockIdx.x] = sdata[0];
    }
}

// Abort with a readable message as soon as a CUDA runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
#endif

// Fills an array with random integers, computes its minimum on the CPU and on
// the GPU (two passes of kernel_min), and prints both values for comparison.
int main()
{
    const int N = 3000;
    const int nblocks = (N + tbp - 1) / tbp;
    srand(time(NULL));

    int *a = (int*)malloc(N * sizeof(int));
    if (a == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    int *dev_a, *dev_d;
    CUDA_CHECK(cudaMalloc((void **) &dev_a, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void **) &dev_d, nblocks * sizeof(int)));

    // CPU reference minimum, seeded from the first element instead of a
    // hard-coded bound (the generated values range over 5..104).
    int mmm = 0;
    for (int i = 0; i < N; i++)
    {
        a[i] = rand() % 100 + 5;
        if (i == 0 || mmm > a[i]) mmm = a[i];
    }

    CUDA_CHECK(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice));

    // Pass 1: one partial minimum per block, written to dev_d[0..nblocks).
    kernel_min<<<nblocks, tbp>>>(dev_a, dev_d, N);
    CUDA_CHECK(cudaGetLastError());
    // Pass 2: a single block reduces the partial minima in place. This needs
    // nblocks <= tbp, which holds here (nblocks is 6).
    kernel_min<<<1, tbp>>>(dev_d, dev_d, nblocks);
    CUDA_CHECK(cudaGetLastError());

    // Only dev_d[0] carries the final result; the blocking copy also
    // synchronises with the kernels above.
    int gpu_min = 0;
    CUDA_CHECK(cudaMemcpy(&gpu_min, dev_d, sizeof(int), cudaMemcpyDeviceToHost));

    printf("cpu min %d, gpu_min = %d\n", mmm, gpu_min);

    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_d));
    free(a);

    return 0;
}

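This reduction code finds the minimum (for your maximum, flip the comparison and start from a very small value instead of a very large one). It works only for N <= 512*512, because the second launch reduces the per-block results with a single block of 512 threads.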

For the second function you can use a simple kernel (you have to put in the appropriate parameters):

// Element-wise rescaling: dev_prob[i] = 0.9 * dev_fitness[i] / maxfit + 0.1
// for every i in [0, N).
//
// One thread per element on a 1D grid of 1D blocks; launch with at least N
// threads in total. maxfit is passed by value, so the result of the max
// reduction has to be copied back to the host before this kernel is launched.
__global__ void normalizze(const double* dev_fitness, double* dev_prob, double maxfit, int N)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N)
    {
        dev_prob[i] = (0.9 * (dev_fitness[i] / maxfit)) + 0.1;
    }
}

And you call it after the kernel_min (or kernel_max) calls have finished, with:

// Launch one thread per element; "..." stands for the kernel's array arguments.
normalizze<<<nblocks, tbp>>>(..., N);

Thank you.
I searched a lot about 2D arrays in CUDA and read the NVIDIA C++ programming guide, but all I found is that I should allocate the memory with "cudaMallocPitch" and use dim3.
But I don't know how to convert this code to CUDA!
Please explain how I can do this.

// CPU version to be ported: whenever the (elided) condition holds for row i,
// that row of Foo is copied into GlobalParams.
int main()
{
    const int F = 40;   // number of rows in Foo
    const int D = 80;   // number of columns in Foo and length of GlobalParams
    int i, j;
    double GlobalParams[D];
    double Foo[F][D];

    for (i = 0; i < F; i++)
    {
        // only the inner (second) for loop is to be converted to CUDA
        if (...)   // placeholder condition, left elided as in the original post
        {
            for (j = 0; j < D; j++)
                GlobalParams[j] = Foo[i][j];
        }
    }
    return 0;
}

If you have a 2D array, it depends on what you are using it for. In my codes it was enough to map it to a 1D array, such as a[i][j]–>dev_a[i+j*lx]. In CUDA, 2D arrays have a special meaning: they are optimized for 2D access, such as access to all neighbours of an element [i,j]. For your code above I am not sure what you want to do.

I want to run this code on the GPU:
copy row i of Foo (one element per column j) into GlobalParams.

// Copy row i of Foo into GlobalParams, one element per iteration.
j = 0;
while (j < D)
{
    GlobalParams[j] = Foo[i][j];
    ++j;
}

I tried to convert it, but I'm sure the part of the code that deals with the Foods array is not correct.
Please help me.
Thank you.
I'm sorry, my English is bad.

#define D 40
#define F 80

// Copies row i of the flattened F x D matrix dev_Foods into dev_GlobalParams.
//
// dev_Foods is one contiguous buffer in row-major order, so element [i][j] of
// the host array foods[F][D] lives at dev_Foods[i * D + j].
// Launch on a 1D grid with at least D threads in total; surplus threads are
// masked off by the bounds check.
__global__ void Func_1(double* dev_GlobalParams,double* dev_Foods,int i )
{
	int j=blockIdx.x*blockDim.x+threadIdx.x;
	if(j<D)
		dev_GlobalParams[j]=dev_Foods[i*D+j];
}

// Abort with a readable message as soon as a CUDA runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
#endif

// Uploads foods to the GPU as a flat array and, each time f[i] improves on
// GlobalMin, copies row i into dev_GlobalParams with Func_1. The final
// parameters are copied back into GlobalParams at the end.
//
// NOTE(review): f[] and GlobalMin are not declared anywhere in this snippet;
// they are assumed to be defined elsewhere in the program. Confirm.
int main()
{
	int i;
	double GlobalParams[D] = {0};   //* 1D array, D elements
	double foods[F][D] = {{0}};     //* 2D array, F rows of D elements

	double* dev_GlobalParams;
	size_t GlobalParamsSize=D*sizeof(double);
	CUDA_CHECK(cudaMalloc((void**)&dev_GlobalParams,GlobalParamsSize));
	CUDA_CHECK(cudaMemcpy(dev_GlobalParams,GlobalParams,GlobalParamsSize,cudaMemcpyHostToDevice));
	//-----------------------
	// A C array foods[F][D] is already contiguous in row-major order, so a
	// plain cudaMalloc of F*D doubles and a single cudaMemcpy are enough;
	// neither cudaMallocPitch nor dim3 is needed for this access pattern.
	double* dev_Foods;
	size_t FoodsSize=(size_t)F*D*sizeof(double);
	CUDA_CHECK(cudaMalloc((void**)&dev_Foods,FoodsSize));
	CUDA_CHECK(cudaMemcpy(dev_Foods,foods,FoodsSize,cudaMemcpyHostToDevice));

	// 1D launch configuration covering the D elements of one row.
	const int threadsPerBlock=64;
	const int blocks=(D+threadsPerBlock-1)/threadsPerBlock;

	for (i=0;i<F;i++)
	{
		if (f[i]<GlobalMin)
		{
			GlobalMin=f[i];
			Func_1<<<blocks,threadsPerBlock>>>(dev_GlobalParams,dev_Foods,i);
			CUDA_CHECK(cudaGetLastError());
		}
	}

	// Bring the selected row back to the host; this blocking copy also waits
	// for the kernels launched above.
	CUDA_CHECK(cudaMemcpy(GlobalParams,dev_GlobalParams,GlobalParamsSize,cudaMemcpyDeviceToHost));

	CUDA_CHECK(cudaFree(dev_Foods));
	CUDA_CHECK(cudaFree(dev_GlobalParams));
	return 0;
}

Hello,

I got it now. I have no idea how to use cudaMallocPitch, but you can use something else which worked for me. Define the 2D matrix as an array of pointers:

// One pointer per matrix row. Both arrays live on the host; the entries of
// dev_Foods are device addresses, so they may be passed to kernels and to
// cudaMemcpy but must never be dereferenced on the host.
double *foods[F],*dev_Foods[F];
// now allocate the memory on host and gpu with a loop
for(int istr=0;istr < F;istr++)
{
    cudaHostAlloc((void**)&foods[istr],sizeof(double)*D,cudaHostAllocDefault);
    cudaMalloc((void**)&dev_Foods[istr],D*sizeof(double));
}
// on the host you can access the elements of foods as usual with [i][j]
// another loop for the copying of the data, one row at a time
for(int istr=0;istr < F;istr++)
{
    cudaMemcpy(dev_Foods[istr],foods[istr],sizeof(double)*D,cudaMemcpyHostToDevice);
}
// now you can make the loop
for(int i=0; i < F; i++)
{
    // something ...
    // ceil(D / ntbp) blocks of ntbp threads cover the D elements of row i
    newFunc_1<<<(D-1+ntbp)/ntbp,ntbp>>>(dev_GlobalParams,dev_Foods[i],D);
}

You have to define the number of threads per block, ntbp.
The new kernel is below:

// Copies the first n elements of one matrix row (dev_Foods) into
// dev_GlobalParams.
//
// One thread per element on a 1D grid of 1D blocks; launch with at least n
// threads in total. The length parameter is called n rather than D so that it
// cannot collide with a `#define D ...` macro in the same file.
__global__ void newFunc_1(double* dev_GlobalParams,double* dev_Foods,int n)
{
    int j=blockIdx.x*blockDim.x+threadIdx.x;
    if(j < n)
    {
        dev_GlobalParams[j]=dev_Foods[j];
    }
}

I do not understand!
I'm really confused. I think NVIDIA needs a dedicated compiler that makes this programming easier.
Would you please show me another example? For example, suppose we have two arrays on the host named h_array1 and h_array2, each of size [10][15], and we want to copy h_array1 to h_array2 using CUDA.

You cannot define a 2D array on the GPU the way you do on the CPU. If on the CPU you define cpu_array[F][D], you cannot do gpu_array[F][D] on the GPU. In practice there are no higher-dimensional arrays on the GPU; all arrays are mapped to 1D arrays. So a matrix of size [1:F]x[1:D] is defined as a 1D array of size [1:F*D]. If you need to work with rows, as in your case, you can define F pointers, each pointing to an array of size D. My code works for what you have shown so far.

there is no h_array2 in cuda.

Good lord, 2D array pointers are a disaster in CUDA: not only confusing, but wasteful of resources. It's a lot better to just flatten the array and use 1D indexing, like so:

http://stackoverflow.com/questions/5631115/2d-array-on-cuda