Mejorar funcion secuencial. La funcion a mejorar es "escalculation"

felipe.ser · December 28, 2019, 3:31pm

Buenos días

Tengo un código que necesito paralelizar en CUDA. La funcion secuencial es la siguiente:

void forces_CPU_AU (int atoms_r, int atoms_l, int nlig, float *rec_x, float *rec_y, float *rec_z, float *lig_x, float *lig_y, float *lig_z, float *ql ,float *qr, float *energy, int nconformations){

double dist, total_elec = 0, miatomo[3], elecTerm;

int totalAtomLig = nconformations * nlig;

for (int k=0; k < totalAtomLig; k+=nlig){
  for(int i=0;i<atoms_l;i++){					
		miatomo[0] = *(lig_x + k + i);
		miatomo[1] = *(lig_y + k + i);
		miatomo[2] = *(lig_z + k + i);

		for(int j=0;j<atoms_r;j++){				
			elecTerm = 0;
    dist=calculaDistancia (rec_x[j], rec_y[j], rec_z[j], miatomo[0], miatomo[1], miatomo[2]);

// printf (“La distancia es %lf\n”, dist);
elecTerm = (ql[i]* qr[j]) / dist;
total_elec += elecTerm;
// printf (“La carga es %lf\n”, total_elec);
}
}

	energy[k/nlig] = total_elec;
	total_elec = 0;

}
printf(“Termino electrostatico %f\n”, energy[0]);
}

HE CREADO LA FUNCION EN CUDA, pero no me logra mejorar la velocidad de proceso:

global void escalculation (int atoms_r, int atoms_l, int nlig, float *rec_x_d, float *rec_y_d, float *rec_z_d, float *lig_x_d, float *lig_y_d, float *lig_z_d, float *ql_d,float *qr_d, float *energy_d, int nconformations)
{
shared float idata[1024];
shared float resultado;

int row =	blockIdx.y * blockDim.y + threadIdx.y;
int col =   blockIdx.x * blockDim.x + threadIdx.x;

int tidx = threadIdx.x;
int k;

float dist = 0;

float temp[1024];

idata[tidx]=0;
temp[tidx]=0;

if (col < atoms_r && row < atoms_l)
{
for ( k=0; k < nconformations*nlig; k+=nlig)
{
idata[tidx]=0;
temp[tidx]=0;
//CALCULAMOS EL VALOR PARA UN BLOQUE

	dist=calculaDistancia (rec_x_d[col], rec_y_d[col], rec_z_d[col], lig_x_d[row+k], lig_y_d[row+k], lig_z_d[row+k]);
	 temp[tidx]  += (ql_d[row]* qr_d[col]) / dist;
         
 
      idata[tidx]= temp[tidx];
 		  __syncthreads();
   
	
   //LO REDUCIMOS	
  //reducion en bloques de 2
	for (int s = blockDim.x / 2; s > 0; s >>= 1) 
	{
		if (tidx < s) 
		{
            idata[tidx] += idata[tidx + s];
		}
		__syncthreads();
	}


 
	// write result for this block to global mem
	//OBTENEMOS EL VALOR EN resultado , todos los bloques.
	if (tidx == 0) 
	{
	  
	     {
		   
		    atomicAdd(&energy_d[k/nlig], idata[0]);
		  // if (k > 0)
		    // printf("Soy  Bloque %i resultado:%f bueno %f \n",k/nlig , lig_x_d[row+k], lig_x_d[row]);
		 }

}

}
}//fin IF

}

LA FUNCION QUE LLAMA A LA ANTERIOR ES:

void forces_GPU_AU (int atoms_r, int atoms_l, int nlig, float *rec_x, float *rec_y, float *rec_z, float *lig_x, float *lig_y, float *lig_z, float *ql ,float *qr, float *energy, int nconformations){

cudaError_t cudaStatus; //variable para recoger estados de cuda

int size=atoms_r * sizeof(float);
int size_l=atoms_l * sizeof(float);
int size_lig=(atoms_l+nconformations*nlig) * sizeof(float);
int size_ene=(nconformations) * sizeof(float);
 int tipo_device;
 int total_hilos;
 int hilos_bloque;
 int anchobloquex;
 int anchobloquey;

 tipo_device=0;   

//seleccionamos device
cudaSetDevice(tipo_device); 
//0 - Tesla K40 2880nucleos Maximum number of threads per multiprocessor:  2048
//                          Maximum number of threads per block:           1024

//vs 1 - Tesla K230 o gt230

//creamos memoria para los vectores para GPU _d (device)
float *rec_x_d, *rec_y_d, *rec_z_d, *qr_d, *lig_x_d, *lig_y_d, *lig_z_d, *ql_d, *energy_d;
	
//reservamos memoria para GPU
cudaMalloc( (void **)  &rec_x_d,atoms_r * sizeof(float));
cudaMalloc( (void **)  &rec_y_d,atoms_r * sizeof(float));
cudaMalloc( (void **)  &rec_z_d,atoms_r * sizeof(float));

cudaMalloc( (void **)  &qr_d,atoms_r * sizeof(float));
cudaMalloc( (void **)  &ql_d,atoms_l * sizeof(float));

cudaMalloc( (void **)  &lig_x_d,(atoms_r +nconformations*nlig) * sizeof(float));
cudaMalloc( (void **)  &lig_y_d,(atoms_r +nconformations*nlig ) * sizeof(float));
cudaMalloc( (void **)  &lig_z_d,(atoms_r +nconformations*nlig) * sizeof(float)); 

cudaMalloc( (void **)  &energy_d,( nconformations) * sizeof(float)); 

//pasamos datos de host to device
cudaMemcpy(rec_x_d,rec_x,size, cudaMemcpyHostToDevice);
cudaMemcpy(rec_y_d,rec_y,size, cudaMemcpyHostToDevice);
cudaMemcpy(rec_z_d,rec_z,size, cudaMemcpyHostToDevice);

cudaMemcpy(qr_d,qr,size, cudaMemcpyHostToDevice);
cudaMemcpy(ql_d,ql,size_l, cudaMemcpyHostToDevice);

cudaMemcpy(lig_x_d,lig_x,size_lig, cudaMemcpyHostToDevice);
cudaMemcpy(lig_y_d,lig_y,size_lig, cudaMemcpyHostToDevice);
cudaMemcpy(lig_z_d,lig_z,size_lig, cudaMemcpyHostToDevice);

cudaMemcpy(energy_d,energy,size_ene, cudaMemcpyHostToDevice);

if (tipo_device == 0 )	
   total_hilos=1024; //por bloque
else    
   total_hilos=1024;


//anchobloque=1024;
total_hilos=atoms_r*atoms_l;
hilos_bloque=1024;

 anchobloquex = 1024;
 anchobloquey =1;
//dimgrid (x,Y)
dim3 dimGrid(ceil((atoms_r+anchobloquex-1)/anchobloquex),ceil((atoms_l+anchobloquey-1)/anchobloquey));
dim3 dimBlock(anchobloquex,anchobloquey); //1024 threads por bloque


//Definir numero de hilos y bloques
printf("bloques: %d\n", (int)ceil(total_hilos/hilos_bloque)+1);
printf("hilos por bloque: %d\n", hilos_bloque);

//llamamos a kernel
escalculation <<< dimGrid ,dimBlock>>> (atoms_r, atoms_l, nlig, rec_x_d, rec_y_d, rec_z_d, lig_x_d, lig_y_d, lig_z_d, ql_d, qr_d, energy_d, nconformations);

//control de errores kernel
cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if(cudaStatus != cudaSuccess) fprintf(stderr, "Error en el kernel %d\n", cudaStatus); 

//Traemos info al host
cudaMemcpy(energy,energy_d, size_ene,cudaMemcpyDeviceToHost);

// para comprobar que la ultima conformacion tiene el mismo resultado que la primera
printf("Termino electrostatico de conformacion %d es: %f\n", nconformations-1, energy[nconformations-1]); 

//resultado varia repecto a SECUENCIAL y CUDA en 0.000002 por falta de precision con float
//posible solucion utilizar double, probablemente bajara el rendimiento -> mas tiempo para calculo
printf("Termino electrostatico GPU %f\n", energy[0]);

//Liberamos memoria reservada para GPU
cudaFree(rec_x_d);
cudaFree(rec_y_d);
cudaFree(rec_z_d);

cudaFree(qr_d);
cudaFree(ql_d);

cudaFree(lig_x_d);
cudaFree(lig_y_d);
cudaFree(lig_z_d);

cudaFree(energy_d);

}

Y CALCULAR DISTANCIA ES:

/**

Distancia euclidea compartida por funcion CUDA y CPU secuencial
*/
device host extern float calculaDistancia (float rx, float ry, float rz, float lx, float ly, float lz) {

float difx = rx - lx;
float dify = ry - ly;
float difz = rz - lz;
float mod2x=difxdifx;
float mod2y=difydify;
float mod2z=difz*difz;
difx=mod2x+mod2y+mod2z;
return sqrtf(difx);
}

NO LOGRO MEJORAR LA FUNCION CUDA, a ver si alguien me puede ayudar. Gracias.

mnicely · December 28, 2019, 8:25pm

First, it’s always a good idea to profile your code with Nsight Compute to get insight to bottlenecks and issues. Check out https://devblogs.nvidia.com/using-nsight-compute-to-inspect-your-kernels/.

Second, your static array temp is mostly likely causing your register count to spill into local memory. Local memory is in global memory and suffers for the same bandwidth limitations.

float temp [1024];

It looks like you’re trying to allot on element in temp to each thread in a block. But you are actually asking each thread in the block to create an array of size 1024.

If I’m reading the code correctly, I think you only need on value per thread.

float temp;

Third, when you reduce the sum of idata, may I suggest you try CUB BlockReduce https://nvlabs.github.io/cub/classcub_1_1_block_reduce.html. CUB is library tuned by NVIDIA. Increases portability.

This section of the code is not making sense. The comment suggest you’re creating blocks of size (32,32), but the code looks like (1024,1024)???

if (service_type == 0)
thread_total = 1024; // per block
else
total_hilos = 1024;


// width block = 1024;
total_hilos = atoms_r * atoms_l;
block_threads = 1024;

widthblock = 1024;
anchobloy = 1;
// dimgrid (x, Y)
dim3 dimGrid (ceil ((atoms_r + anchoblox-1) / anchoblox), ceil ((atoms_l + anchoblock-1) / anchoblock));
dim3 dimBlock (width block, width block); // 1024 threads per block

You are using atomicAdd and there’s nothing wrong with that. But depending on how many blocks you launch. It might be faster to store the answer for each block in an array in global memory. Then you have two options. If the array is very large you could try something like CUB DeviceReduce. Or you can just copy everything to a vector on the host and use std::accumulate. I would check all three methods for performance. Also, check out https://thrust.github.io/.

atomicAdd (& energy_d [k / nlig], idata [0]);

I don’t really see where you’re using

__shared__ float result;

I suggest using cudaLaunchKernel over <<<>>>. https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html

Topic		Replies	Views
Cuda program taking more time. CUDA Programming and Performance	15	7227	November 21, 2010
Effective Parallelisation of CUDA C code CUDA Programming and Performance	38	2427	December 27, 2021
2D reduction using CUDA The use a cuda and cublas library for a 2D simple reduction CUDA Programming and Performance	11	4645	February 7, 2012
Summing threads CUDA Programming and Performance	3	3255	June 7, 2011
CUDA kernel for-loop performance CUDA Programming and Performance	16	7036	September 7, 2019
Why cannot run this program CUDA Programming and Performance	7	1066	May 11, 2018
How to get faster than 5-8 seconds CUDA Programming and Performance	12	3212	September 15, 2008
Cuda code performance CUDA Programming and Performance	14	3346	December 16, 2014
Iteration help in CUDA CUDA Programming and Performance	11	7050	April 19, 2012
Newbie Question: Moving nested for loop CPU code to GPU. CUDA Programming and Performance	4	4579	December 2, 2015

Mejorar funcion secuencial. La funcion a mejorar es "escalculation"

Related topics