To mfatica : that is my point and the present topics title. But is it global memory allocation that can span for the whole application or the actual pointer that holds the values. If you take a glance on the previous posts you’ll get an idea…
To sponge… : I basically have two pointers, pointing at constant data that will not change throughout the application. My application is an N-Body problem. I cannot use constant memory as i can have more than 40-50 000 bodies on my app so constant memory is totally out of the question. Apparently for 1d arrays global memory performs better than texture on Fermi cards (2.1 capability)…
So an ideal senario for me would be to have them resident on global memory for the whole application… Here is a very simple snippet that i am trying to run :
device static float* r0_dev;
device static float* kb_dev;
template
global void compute_bonds_energy2(float* e, Lock lock, float *pos_a,
float *pos_b, float *r0, float *kb) {
__shared__ float cache[threads];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
while (tid < molsize) {
printf("|index : %d, tid %d, pos : %lf", cacheIndex, tid, kb[tid]);
tid += blockDim.x * gridDim.x;
}
}
void bondGlobals(MMFF94S_bond_calcs_t* bondsIn, Bond_LookUp_t *lookUp_bonds,
int numAtoms) {
CUDA_SAFE_CALL(cudaMalloc((void**) &r0_dev, numAtoms * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void**) &kb_dev, numAtoms * sizeof(float)));
printf("global allocated \n");
}
float compute_e_bonds(MMFF94S_bond_calcs_t* bondsIn,
Bond_LookUp_t *lookUp_bonds, int numAtoms) {
CUDATimer cu_timer, cu_timer2;
int blocksPerGrid;
//wrapper function timer-----------------------
cu_timer2.Start();
//---------------------------------------------
blocksPerGrid
= imin(cuda::numBlocks,(numAtoms+cuda::threadsPerBlock - 1)/cuda::threadsPerBlock);
float e, *dev_e, *dev_pos_a, *dev_pos_b;//
//allocate pos_a
CUDA_SAFE_CALL(cudaMalloc(
(void**) &dev_pos_a,
3*lookUp_bonds -> bonds_struct_length
* sizeof(float)));
//allocate pos_b
CUDA_SAFE_CALL(cudaMalloc(
(void**) &dev_pos_b,
3*lookUp_bonds -> bonds_struct_length
* sizeof(float)));
CUDA_SAFE_CALL(cudaMemcpy(dev_pos_a, bondsIn -> pos_a,
3*lookUp_bonds -> bonds_struct_length * sizeof(float),
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(dev_pos_b, bondsIn -> pos_b,
3*lookUp_bonds -> bonds_struct_length * sizeof(float),
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(r0_dev, bondsIn -> r0, numAtoms * sizeof(float),
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(kb_dev, bondsIn -> kb, numAtoms * sizeof(float),
cudaMemcpyHostToDevice))
//----------------------------------------------------
//alocate e
CUDA_SAFE_CALL(cudaMalloc((void**) &dev_e, sizeof(float)));
//----------------------------------------------------
CUDA_SAFE_CALL(cudaMemcpy(dev_e, &e, sizeof(float),
cudaMemcpyHostToDevice));
Lock lock;
//---------------------------------------------
cudaThreadSynchronize();
cu_timer.Start();
compute_bonds_energy2<cuda::threadsPerBlock> <<<blocksPerGrid, cuda::threadsPerBlock>>>(dev_e, lock, dev_pos_a, dev_pos_b, r0_dev, kb_dev/*, dev_lengths, dev_offsets */);
CUT_CHECK_ERROR("Kernel execution failed");
//stop kernel timer and ouitput
cudaThreadSynchronize();
double t = cu_timer.GetET();
cudaFree(dev_pos_a);
cudaFree(dev_pos_b);
CUDA_SAFE_CALL(cudaMemcpy(&e, dev_e, sizeof(float),
cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(dev_e));
//stop wrapper function timer and output
double t2 = cu_timer.GetET();
printf("Time to generate bonds wrapper : %3.1lf ms \n", t2);
printf("Time to generate bonds kernel : %3.1lf ms \n", t);
cudaThreadSynchronize();
return e;
}
at the moment a call like
bondGlobals();
kernel();
kernel();
will execute for the first kernel and fail on the second…