Hi, our project was recently updated to use multiple GPUs, but we found that the run-time with 4 GPUs is worse than with a single GPU. The run-time with 2 GPUs is also worse than with a single GPU. The following excerpt from our code shows the basic structure we designed for multi-GPU execution. This structure is repeated in a similar form everywhere, so we were wondering if there is something we missed or misunderstood.
//=======================================================================================
template< size_t term_ID, size_t power >
__global__ void interface_orm(
int ndim, int GPU_offset, double* device_Re_valueS, const double* __restrict__ Rule, DEVICERatioS* device_RatioS ) {
//=======================================================================================
    // Evaluate |func<term_ID>(xx, ...)| at one lattice point per thread.
    //
    // Grid layout: 1-D grid, 1-D blocks. Each thread handles the GLOBAL point
    // `index` (thread id + GPU_offset) but writes its result at the LOCAL
    // offset (index - GPU_offset) of this GPU's own output buffer, so each
    // device fills its buffer starting at element 0.
    //
    // `Rule` is read-only and uniform per dimension: const __restrict__ lets
    // the compiler route loads through the read-only data cache (SM 3.5+).
    //
    // NOTE(review): there is no `index < n` guard — the launch configuration
    // must cover EXACTLY n_T2/n_GPU points per device, otherwise threads past
    // the end write out of bounds. Confirm n_T2 is divisible by n_GPU and by
    // the block size, or pass a count and add a guard.
    const int index = blockIdx.x * blockDim.x + threadIdx.x + GPU_offset;
    double xx[max_n_dim];
    for( int dim_Cindex = 0; dim_Cindex < ndim; ++dim_Cindex ) {
        // Fractional part of index * generator, mapped onto [lower, upper).
        // NOTE(review): the (int) cast assumes index*Rule[dim] < INT_MAX;
        // confirm for large n_T2.
        double xx_shift = index * Rule[dim_Cindex];
        xx_shift = xx_shift - (int)xx_shift;
        const double upper = 1.0;
        const double lower = 0.0;
        xx[dim_Cindex] = lower + (upper-lower) * xx_shift;
    }//endfor
    // fabs, not abs: plain C abs() takes an int and would silently truncate
    // the double-valued integrand; fabs is unambiguous in device code.
    device_Re_valueS[index-GPU_offset] = fabs( func<term_ID>( xx, device_RatioS ) );
}//end interface_orm
//=======================================================================================
template< size_t term_ID, size_t power >
double orm_daC( const int& ndim, const int& in_index, GLOBAL* global ) {
//=======================================================================================
#include "constants.inc"
    // For each GPU: launch the kernel on that device's private stream, then
    // queue an async D2H copy of its slice on the same stream. Afterwards,
    // wait on EVERY stream before reading the host buffer.
    //
    // Performance notes (likely cause of the reported multi-GPU slowdown):
    //   * cudaMemcpyAsync only overlaps with other devices' work if
    //     host_Re_valueS is PINNED memory (cudaMallocHost / cudaHostAlloc).
    //     With pageable memory the copy is effectively synchronous, so the
    //     loop serializes: GPU 1 does not start until GPU 0's copy finishes.
    //   * Per-device cudaSetDevice + launch + copy overhead is fixed cost; if
    //     each slice (n_T2/n_GPU points) is small, that overhead dominates and
    //     more GPUs make the total time WORSE. Confirm with a profiler.
    for( int GPU_Cindex = 0; GPU_Cindex < global->n_GPU; ++GPU_Cindex ) {
        checkCudaErrors( cudaSetDevice(GPU_Cindex) );
        //-------------------------------------------
        // Contiguous equal slice per device: [GPU_offset, GPU_offset + n_T2/n_GPU).
        // NOTE(review): assumes n_T2 is divisible by n_GPU; otherwise the tail
        // points are never computed.
        int GPU_offset = GPU_Cindex*n_T2/global->n_GPU;
        //-------------------------------------------
        interface_orm<term_ID, power>
            <<< n_block_T2/global->n_GPU, block_size, 0, global->GPUBagS[GPU_Cindex].stream >>>
            ( ndim, GPU_offset,
              global->GPUBagS[GPU_Cindex].device_Re_valueS,
              global->GPUBagS[GPU_Cindex].device_Rule_T2,
              global->GPUBagS[GPU_Cindex].device_RatioS );
        // Kernel launches return no status themselves; surface bad launch
        // configurations (e.g. grid of 0 blocks when n_GPU > n_block_T2) here.
        checkCudaErrors( cudaGetLastError() );
        //-------------------------------------------
        // Stream-ordered: the copy starts only after this device's kernel ends.
        checkCudaErrors( cudaMemcpyAsync(
            global->host_Re_valueS + GPU_offset,
            global->GPUBagS[GPU_Cindex].device_Re_valueS,
            (n_T2/global->n_GPU)*sizeof(double),
            cudaMemcpyDeviceToHost,
            global->GPUBagS[GPU_Cindex].stream
            ) );
        //-------------------------------------------
    }//endfor GPU_Cindex
    //-------------------------------------------
    // BUG FIX: a single cudaDeviceSynchronize() waits only on the CURRENT
    // device (the last one selected above). Work queued on the other GPUs
    // could still be in flight while the host reads the buffer below.
    // Synchronize every device's stream explicitly.
    for( int GPU_Cindex = 0; GPU_Cindex < global->n_GPU; ++GPU_Cindex ) {
        checkCudaErrors( cudaSetDevice(GPU_Cindex) );
        checkCudaErrors( cudaStreamSynchronize( global->GPUBagS[GPU_Cindex].stream ) );
    }//endfor GPU_Cindex
    //-------------------------------------------
    // Scale factor: 1/max|value| over all points, capped at 1.0.
    // NOTE(review): divides by the max — confirm it can never be 0.
    return min( 1.0, 1.0/(*std::max_element( global->host_Re_valueS, global->host_Re_valueS+n_T2 )) );
}//end orm_daC
The GPU cards are Tesla K20s, and CUDA 6.5 is used. Any thoughts or suggestions are appreciated. Thank you.