Mejorar funcion secuencial. La funcion a mejorar es "escalculation"

Buenos días

Tengo un código que necesito paralelizar en CUDA. La funcion secuencial es la siguiente:

void forces_CPU_AU (int atoms_r, int atoms_l, int nlig, float *rec_x, float *rec_y, float *rec_z, float *lig_x, float *lig_y, float *lig_z, float *ql ,float *qr, float *energy, int nconformations){

double dist, total_elec = 0, miatomo[3], elecTerm;

int totalAtomLig = nconformations * nlig;

for (int k=0; k < totalAtomLig; k+=nlig){
  for(int i=0;i<atoms_l;i++){					
		miatomo[0] = *(lig_x + k + i);
		miatomo[1] = *(lig_y + k + i);
		miatomo[2] = *(lig_z + k + i);

		for(int j=0;j<atoms_r;j++){				
			elecTerm = 0;
    dist=calculaDistancia (rec_x[j], rec_y[j], rec_z[j], miatomo[0], miatomo[1], miatomo[2]);

// printf (“La distancia es %lf\n”, dist);
elecTerm = (ql[i]* qr[j]) / dist;
total_elec += elecTerm;
// printf (“La carga es %lf\n”, total_elec);
}
}

	energy[k/nlig] = total_elec;
	total_elec = 0;

}
printf(“Termino electrostatico %f\n”, energy[0]);
}

HE CREADO LA FUNCION EN CUDA, pero no me logra mejorar la velocidad de proceso:

global void escalculation (int atoms_r, int atoms_l, int nlig, float *rec_x_d, float *rec_y_d, float *rec_z_d, float *lig_x_d, float *lig_y_d, float *lig_z_d, float *ql_d,float *qr_d, float *energy_d, int nconformations)
{
shared float idata[1024];
shared float resultado;

int row =	blockIdx.y * blockDim.y + threadIdx.y;
int col =   blockIdx.x * blockDim.x + threadIdx.x;

int tidx = threadIdx.x;
int k;

float dist = 0;

float temp[1024];

idata[tidx]=0;
temp[tidx]=0;

if (col < atoms_r && row < atoms_l)
{
for ( k=0; k < nconformations*nlig; k+=nlig)
{
idata[tidx]=0;
temp[tidx]=0;
//CALCULAMOS EL VALOR PARA UN BLOQUE

	dist=calculaDistancia (rec_x_d[col], rec_y_d[col], rec_z_d[col], lig_x_d[row+k], lig_y_d[row+k], lig_z_d[row+k]);
	 temp[tidx]  += (ql_d[row]* qr_d[col]) / dist;
         
 
      idata[tidx]= temp[tidx];
 		  __syncthreads();
   
	
   //LO REDUCIMOS	
  //reducion en bloques de 2
	for (int s = blockDim.x / 2; s > 0; s >>= 1) 
	{
		if (tidx < s) 
		{
            idata[tidx] += idata[tidx + s];
		}
		__syncthreads();
	}


 
	// write result for this block to global mem
	//OBTENEMOS EL VALOR EN resultado , todos los bloques.
	if (tidx == 0) 
	{
	  
	     {
		   
		    atomicAdd(&energy_d[k/nlig], idata[0]);
		  // if (k > 0)
		    // printf("Soy  Bloque %i resultado:%f bueno %f \n",k/nlig , lig_x_d[row+k], lig_x_d[row]);
		 }

}

}
}//fin IF

}

LA FUNCION QUE LLAMA A LA ANTERIOR ES:

void forces_GPU_AU (int atoms_r, int atoms_l, int nlig, float *rec_x, float *rec_y, float *rec_z, float *lig_x, float *lig_y, float *lig_z, float *ql ,float *qr, float *energy, int nconformations){

cudaError_t cudaStatus; //variable para recoger estados de cuda

int size=atoms_r * sizeof(float);
int size_l=atoms_l * sizeof(float);
int size_lig=(atoms_l+nconformations*nlig) * sizeof(float);
int size_ene=(nconformations) * sizeof(float);
 int tipo_device;
 int total_hilos;
 int hilos_bloque;
 int anchobloquex;
 int anchobloquey;

 tipo_device=0;   

//seleccionamos device
cudaSetDevice(tipo_device); 
//0 - Tesla K40 2880nucleos Maximum number of threads per multiprocessor:  2048
//                          Maximum number of threads per block:           1024

//vs 1 - Tesla K230 o gt230

//creamos memoria para los vectores para GPU _d (device)
float *rec_x_d, *rec_y_d, *rec_z_d, *qr_d, *lig_x_d, *lig_y_d, *lig_z_d, *ql_d, *energy_d;
	
//reservamos memoria para GPU
cudaMalloc( (void **)  &rec_x_d,atoms_r * sizeof(float));
cudaMalloc( (void **)  &rec_y_d,atoms_r * sizeof(float));
cudaMalloc( (void **)  &rec_z_d,atoms_r * sizeof(float));

cudaMalloc( (void **)  &qr_d,atoms_r * sizeof(float));
cudaMalloc( (void **)  &ql_d,atoms_l * sizeof(float));

cudaMalloc( (void **)  &lig_x_d,(atoms_r +nconformations*nlig) * sizeof(float));
cudaMalloc( (void **)  &lig_y_d,(atoms_r +nconformations*nlig ) * sizeof(float));
cudaMalloc( (void **)  &lig_z_d,(atoms_r +nconformations*nlig) * sizeof(float)); 

cudaMalloc( (void **)  &energy_d,( nconformations) * sizeof(float)); 

//pasamos datos de host to device
cudaMemcpy(rec_x_d,rec_x,size, cudaMemcpyHostToDevice);
cudaMemcpy(rec_y_d,rec_y,size, cudaMemcpyHostToDevice);
cudaMemcpy(rec_z_d,rec_z,size, cudaMemcpyHostToDevice);

cudaMemcpy(qr_d,qr,size, cudaMemcpyHostToDevice);
cudaMemcpy(ql_d,ql,size_l, cudaMemcpyHostToDevice);

cudaMemcpy(lig_x_d,lig_x,size_lig, cudaMemcpyHostToDevice);
cudaMemcpy(lig_y_d,lig_y,size_lig, cudaMemcpyHostToDevice);
cudaMemcpy(lig_z_d,lig_z,size_lig, cudaMemcpyHostToDevice);

cudaMemcpy(energy_d,energy,size_ene, cudaMemcpyHostToDevice);

if (tipo_device == 0 )	
   total_hilos=1024; //por bloque
else    
   total_hilos=1024;


//anchobloque=1024;
total_hilos=atoms_r*atoms_l;
hilos_bloque=1024;

 anchobloquex = 1024;
 anchobloquey =1;
//dimgrid (x,Y)
dim3 dimGrid(ceil((atoms_r+anchobloquex-1)/anchobloquex),ceil((atoms_l+anchobloquey-1)/anchobloquey));
dim3 dimBlock(anchobloquex,anchobloquey); //1024 threads por bloque


//Definir numero de hilos y bloques
printf("bloques: %d\n", (int)ceil(total_hilos/hilos_bloque)+1);
printf("hilos por bloque: %d\n", hilos_bloque);

//llamamos a kernel
escalculation <<< dimGrid ,dimBlock>>> (atoms_r, atoms_l, nlig, rec_x_d, rec_y_d, rec_z_d, lig_x_d, lig_y_d, lig_z_d, ql_d, qr_d, energy_d, nconformations);

//control de errores kernel
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if(cudaStatus != cudaSuccess) fprintf(stderr, "Error en el kernel %d\n", cudaStatus); 

//Traemos info al host
cudaMemcpy(energy,energy_d, size_ene,cudaMemcpyDeviceToHost);

// para comprobar que la ultima conformacion tiene el mismo resultado que la primera
printf("Termino electrostatico de conformacion %d es: %f\n", nconformations-1, energy[nconformations-1]); 

//resultado varia repecto a SECUENCIAL y CUDA en 0.000002 por falta de precision con float
//posible solucion utilizar double, probablemente bajara el rendimiento -> mas tiempo para calculo
printf("Termino electrostatico GPU %f\n", energy[0]);

//Liberamos memoria reservada para GPU
cudaFree(rec_x_d);
cudaFree(rec_y_d);
cudaFree(rec_z_d);

cudaFree(qr_d);
cudaFree(ql_d);

cudaFree(lig_x_d);
cudaFree(lig_y_d);
cudaFree(lig_z_d);

cudaFree(energy_d);

}

Y CALCULAR DISTANCIA ES:

/**

  • Distancia euclidea compartida por funcion CUDA y CPU secuencial
    */
    device host extern float calculaDistancia (float rx, float ry, float rz, float lx, float ly, float lz) {

    float difx = rx - lx;
    float dify = ry - ly;
    float difz = rz - lz;
    float mod2x=difxdifx;
    float mod2y=dify
    dify;
    float mod2z=difz*difz;
    difx=mod2x+mod2y+mod2z;
    return sqrtf(difx);
    }

NO LOGRO MEJORAR LA FUNCION CUDA, a ver si alguien me puede ayudar. Gracias.

First, it’s always a good idea to profile your code with Nsight Compute to get insight to bottlenecks and issues. Check out https://devblogs.nvidia.com/using-nsight-compute-to-inspect-your-kernels/.

Second, your static array temp is mostly likely causing your register count to spill into local memory. Local memory is in global memory and suffers for the same bandwidth limitations.

float temp [1024];

It looks like you’re trying to allot on element in temp to each thread in a block. But you are actually asking each thread in the block to create an array of size 1024.

If I’m reading the code correctly, I think you only need on value per thread.

float temp;

Third, when you reduce the sum of idata, may I suggest you try CUB BlockReduce https://nvlabs.github.io/cub/classcub_1_1_block_reduce.html. CUB is library tuned by NVIDIA. Increases portability.

This section of the code is not making sense. The comment suggest you’re creating blocks of size (32,32), but the code looks like (1024,1024)???

if (service_type == 0)
thread_total = 1024; // per block
else
total_hilos = 1024;


// width block = 1024;
total_hilos = atoms_r * atoms_l;
block_threads = 1024;

widthblock = 1024;
anchobloy = 1;
// dimgrid (x, Y)
dim3 dimGrid (ceil ((atoms_r + anchoblox-1) / anchoblox), ceil ((atoms_l + anchoblock-1) / anchoblock));
dim3 dimBlock (width block, width block); // 1024 threads per block

You are using atomicAdd and there’s nothing wrong with that. But depending on how many blocks you launch. It might be faster to store the answer for each block in an array in global memory. Then you have two options. If the array is very large you could try something like CUB DeviceReduce. Or you can just copy everything to a vector on the host and use std::accumulate. I would check all three methods for performance. Also, check out https://thrust.github.io/.

atomicAdd (& energy_d [k / nlig], idata [0]);

I don’t really see where you’re using

__shared__ float result;

I suggest using cudaLaunchKernel over <<<>>>. https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html