Hi,
I am working on an algorithm and was using the cudaMalloc function to allocate memory. However, I have to make multiple kernel calls, which means I would have to transfer the data every time. To avoid that data transfer, I created a device variable, allocated memory for it inside a kernel function using malloc, and freed it using free().
Apparently these variables are created on the device heap. The heap also has limited memory, but its size can be increased at the start of the application.
My question is whether this would speed up the processing, or whether I should try something else. Please guide me toward a better understanding.
Here is my code for reference (any suggestions related to the code would also be appreciated, thanks):
/*
 * Device-global pointers shared by the kernels below.
 * Each one is assigned by initializePopulation() using in-kernel malloc()
 * and released by deletePopulation() using in-kernel free().
 */
/* Population array of numberOfCandidates * problemSize ints; setBits() fills it with 0/1 values. */
__device__ int *device_population;
/* Second array of the same size as device_population; not written by the kernels visible here. */
__device__ int *device_populationOffSpring;
/* Points to a single int holding the numberOfCandidates passed to initializePopulation(). */
__device__ int *device_candidatesCount;
/* Points to a single int holding the problemSize passed to initializePopulation(). */
__device__ int *device_problemSize;
/* Points to a single int holding the threads-per-block value used for the child kernel launches. */
__device__ int *device_threadsCount;
/*
 * Initializes one cuRAND generator state per element.
 *
 * seed             - seed value; every thread passes the same seed.
 * states           - output array with room for at least numberOfElements states.
 * numberOfElements - number of states to initialize.
 *
 * Uses a 1D launch: the thread with flat index t initializes states[t].
 * Threads whose flat index is >= numberOfElements do nothing.
 */
__global__ void init_CudaRandom(unsigned int seed, curandState_t* states, int numberOfElements) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    /* Threads past the end of the array have nothing to set up. */
    if (tid >= numberOfElements)
    {
        return;
    }

    /* Same seed everywhere; the flat thread index is used as the sequence
       number so each state gets its own sequence; the offset is 0. */
    curand_init(seed, tid, 0, &states[tid]);
}
/*
 * Fills device_population with random bits (each element becomes 0 or 1).
 *
 * states        - array of at least totalElements cuRAND states, one per element.
 * totalElements - number of entries of device_population to write.
 *
 * Uses a 1D launch: the thread with flat index i writes device_population[i]
 * using its own generator state states[i]. Threads with i >= totalElements
 * do nothing. Nothing is written when device_population is NULL.
 */
__global__ void setBits(curandState_t* states, int totalElements)
{
    int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i < totalElements && device_population != NULL)
    {
        /* Each thread must advance its OWN state. Sharing states[0] between
           all threads is a data race on that state and ignores the other
           per-element states entirely. Work on a local copy and store the
           advanced state back so later draws continue the sequence. */
        curandState_t localState = states[i];
        device_population[i] = curand(&localState) % 2;
        states[i] = localState;
    }
}
/*
 * Allocates the device-global population buffers on the device heap and
 * fills device_population with random bits.
 *
 * numberOfCandidates - number of candidates in the population (must be > 0).
 * problemSize        - number of ints per candidate (must be > 0).
 *
 * Only the first thread of the first block performs the work, so the kernel
 * is safe to launch with any configuration (<<<1, 1>>> is sufficient).
 *
 * On success the globals device_problemSize, device_candidatesCount,
 * device_threadsCount, device_population and device_populationOffSpring
 * point to valid heap allocations. If an allocation fails, everything that
 * was allocated is freed and all five globals are set to NULL. If the
 * arguments are invalid (non-positive, or their product does not fit in an
 * int) the kernel returns without touching the globals.
 *
 * The allocations come from the device malloc heap; its size has to be
 * large enough for the buffers plus one curandState_t per element.
 *
 * NOTE(review): device-side cudaDeviceSynchronize() is deprecated in recent
 * CUDA toolkits and may be unavailable in the newest ones - confirm against
 * the toolkit version in use.
 */
__global__ void initializePopulation(int numberOfCandidates, int problemSize)//, int *device_Pop)
{
    /* One thread does the setup; otherwise every launched thread would
       allocate its own buffers and overwrite the shared globals. */
    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0 ||
        threadIdx.x != 0 || threadIdx.y != 0 || threadIdx.z != 0)
    {
        return;
    }

    /* Reject sizes that would lead to a division by zero or an invalid launch. */
    if (numberOfCandidates <= 0 || problemSize <= 0)
    {
        return;
    }

    /* The child kernels take the element count as an int, so it must fit. */
    const long long total64 = (long long)numberOfCandidates * (long long)problemSize;
    if (total64 > 2147483647LL)
    {
        return;
    }
    const int totalElements = (int)total64;

    device_problemSize = (int*)malloc(sizeof(int));
    device_candidatesCount = (int*)malloc(sizeof(int));
    device_threadsCount = (int*)malloc(sizeof(int));
    device_population = (int*)malloc(sizeof(int) * (size_t)totalElements);
    device_populationOffSpring = (int*)malloc(sizeof(int) * (size_t)totalElements);

    /* Temporary generator states, one per element: one attempt plus up to
       ten retries. */
    curandState_t* states = NULL;
    for (int attempt = 0; attempt <= 10 && states == NULL; attempt++)
    {
        states = (curandState_t*)malloc(sizeof(curandState_t) * (size_t)totalElements);
    }

    /* Device malloc returns NULL when the heap is exhausted. Release whatever
       was obtained (free(NULL) is a no-op) and leave the globals NULL so no
       one dereferences a bad pointer. */
    if (device_problemSize == NULL || device_candidatesCount == NULL ||
        device_threadsCount == NULL || device_population == NULL ||
        device_populationOffSpring == NULL || states == NULL)
    {
        free(states);
        free(device_population);
        free(device_populationOffSpring);
        free(device_candidatesCount);
        free(device_problemSize);
        free(device_threadsCount);
        device_population = NULL;
        device_populationOffSpring = NULL;
        device_candidatesCount = NULL;
        device_problemSize = NULL;
        device_threadsCount = NULL;
        return;
    }

    *device_problemSize = problemSize;
    *device_candidatesCount = numberOfCandidates;

    /* A block may hold at most 1024 threads, so clamp the block size. */
    const int threadsPerBlock = (numberOfCandidates < 1024) ? numberOfCandidates : 1024;
    *device_threadsCount = threadsPerBlock;

    /* Round up so the tail elements are covered when the element count is
       not a multiple of the block size; both child kernels bounds-check. */
    const int gridSize = (int)((total64 + threadsPerBlock - 1) / threadsPerBlock);

    /* Now initializing the population with random data. */
    init_CudaRandom<<<gridSize, threadsPerBlock>>>(1234, states, totalElements);
    cudaDeviceSynchronize();

    setBits<<<gridSize, threadsPerBlock>>>(states, totalElements);
    /* The child kernel must be finished before its states are released. */
    cudaDeviceSynchronize();

    free(states);
}
/*
 * Releases the device-heap buffers referenced by the device-global pointers
 * and resets those pointers to NULL.
 *
 * Only the first thread of the first block performs the work, so launching
 * with more than one thread does not free the same pointer several times.
 * Because the pointers are reset to NULL (and free(NULL) is a no-op),
 * calling this kernel more than once is harmless.
 */
__global__ void deletePopulation()
{
    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0 ||
        threadIdx.x != 0 || threadIdx.y != 0 || threadIdx.z != 0)
    {
        return;
    }

    free(device_population);
    device_population = NULL;

    free(device_populationOffSpring);
    device_populationOffSpring = NULL;

    free(device_candidatesCount);
    device_candidatesCount = NULL;

    free(device_problemSize);
    device_problemSize = NULL;

    free(device_threadsCount);
    device_threadsCount = NULL;
}