Hello,
I develop a large scientific software package for materials science. I am currently implementing a GPU version of the code. I created a minimal GPU example which mimics the workload in the main code for the rate limiting step. This is still somewhat complex, however, as there are multiple POSIX threads and each runs its own kernel for its own job. I setup the threads to use stream concurrency with each POSIX thread using it’s default stream.
I then profiled the code using NVTX and NSIGHT Systems. I have looked at the code both with the CLI and GUI. One thing that sticks out to me is that small DtoH memory transfers are taking an outsized share of the runtime. After each kernel run there are two doubles transferred DtoH, and these take longer than 64,000 doubles transferred HtoD before the kernel run.
This is the runtime breakdown of my code from nsys --stats
Time (%) Total Time (s) Range
27.4 280 :ENERGY_OUT
25.8 264 :FENERGY_OUT
22.9 234 :CUDAMalloc
10.3 105 :KERNEL
10.1 103 :CUDAFree
2.8 28 :ELEVELS_IN
ELEVELS_IN is a 64k double HtoD transfer, ENERGY_OUT is a double DtoH transfer, and FENERGY_OUT is another double DtoH transfer.
I had assumed that the stream concurrency would have helped to queue up constant data transfers, but I see long empty periods in between transfers. I am having some difficulty understanding these results as the asynchronous calls can sometimes confound the profiling results. I have seen before that depending on where you put cudaStreamSynchronize calls it can look like one call is taking a long time when it’s actually the previous call finishing first. To mitigate this problem, I call cudaStreamSynchronize between each GPU transfer or kernel.
I would really appreciate any help or advice you can provide as this is outside of my capabilities to fix and I am stuck.
Why does it look like the small memory transfers are so slow? I assumed these would be almost negligible.
Is there is an intrinsic latency to the memory transfer?
Is there a reason that DtoH would be slower than HtoD?
Are all the calls to cudaStreamSynchronize slowing down the run?
Are there conflicts between the memory transfers on different streams?
Is there a problem with my setup of the streams so they are not as concurrent as I expect?
Is the profile inaccurate because of concurrency/asynchronous calls?
The code is given below. It is compiled with:
nvcc my_program.cu -lm -lpthread --default-stream per-thread -o my_program
A typical run is:
my_program 256 1 1024 1 1024 64000 27
The arguments here are:
256 - POSIX threads
1 - block (don't change this)
1024 - threads per block
1 - run_mask (1 is GPU, 2 is CPU, 3 is both)
1024 - repeats
64,000 - vector size
27 - Memory Transfer Mask (There are 5 memory transfers available, the value of 27 produces the 3 transfers mentioned in this post)
Once again, thank you for your help and let me know if there is any other information I can provide that would be of assistance.
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <pthread.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <nvtx3/nvToolsExt.h>
enum MEMCPY_BITMASK {ELEVELS_IN, FENERGY_IN, WEIGHTS_OUT, FENERGY_OUT, ENERGY_OUT};
__device__ void fermiIteration(double *energyLevels, double fermiGuess, double temperature, int numEnergyLevels, double *weights, double *energyWeights, double *weightSum)
{
double threadEnergy=0;//Sum for each thread
double threadWeight=0;
int iLevel;
for(iLevel = threadIdx.x;iLevel < numEnergyLevels;iLevel+=blockDim.x)
{
weights[iLevel] = 1.0/(1.0+exp((*(energyLevels+iLevel)-fermiGuess)/temperature));
threadEnergy +=weights[iLevel] * energyLevels[iLevel];
threadWeight += weights[iLevel];
}
energyWeights[threadIdx.x] = threadEnergy; //Store sum from each thread at base value
weightSum[threadIdx.x] = threadWeight; //Store sum from each thread at base value
for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
{
__syncthreads();
if(threadIdx.x < stride)
{
weightSum[threadIdx.x ] += weightSum[threadIdx.x+stride];
energyWeights[threadIdx.x] += energyWeights[threadIdx.x+stride];
}
}
__syncthreads();
}
__global__ void fermiKernel(double *energyLevels, double *fermiEnergy, double currentExcitations, double fermiConvergence, double temperature, int numEnergyLevels, double *weights, double *energyWeights, double *d_energy, double *weightSum)
{
double fermiGuess = *fermiEnergy;
double fermiMin = fermiGuess - 7.5*fermiConvergence;
double fermiMax = fermiGuess + 8.49*fermiConvergence;//This is slightly offset so that the fermi level could be almost unchanged.
double totalCarriersMin=0;
double totalCarriersMax=0;
double totalCarriers;
//Try to bound the fermi level within a small box around the previous result which requires not more than 6 iterations to converge.
fermiIteration(energyLevels, fermiMin, temperature, numEnergyLevels, weights, energyWeights, weightSum);
totalCarriersMin = *weightSum;
fermiIteration(energyLevels, fermiMax, temperature, numEnergyLevels, weights, energyWeights, weightSum);
totalCarriersMax = *weightSum;
if((totalCarriersMin > currentExcitations) || (totalCarriersMax < currentExcitations))//If the solution is not within the bounds expand the bounds.
{
//printf("Fermi level outside bounds, Min %.10g max %.10g, min carriers %.10g, max carriers %.10g, target carriers %g\n",fermiMin,fermiMax,totalCarriersMin,totalCarriersMax,currentExcitations);
fermiMin = optics_llimit;
fermiMax = optics_ulimit;
}
while(fermiMax - fermiMin > fermiConvergence)
{
fermiGuess = (fermiMax+fermiMin)/2.0;
__syncthreads();
fermiIteration(energyLevels, fermiGuess, temperature, numEnergyLevels, weights, energyWeights, weightSum);
totalCarriers = *weightSum;
if(totalCarriers > currentExcitations) //Lower search range
fermiMax = fermiGuess;
else
fermiMin = fermiGuess;
//printf("Fermi range %g %g Carriers %g %g\n",fermiMin,fermiMax,totalCarriers,currentExcitations);
__syncthreads();
}
if(threadIdx.x == 0)
{
*fermiEnergy = fermiGuess;
*d_energy = *energyWeights;
}
}
void fermiSum( double *energyLevels, double fermiEnergy, double temperature, int numEnergyLevels, double *weights, double *weightSum, double *energySum)
{
/*
Evaluates the fermi Sum, N=sum(fi) and E=sum(fi * Ei)
where fi is the fermi function fi = 1/(1+exp((Ei-Ef)/kT))
Inputs are:
energyLevels (Ei)
fermiEnergy (Ef)
temperature (kT)
numEnergyLevels (n)
outputs are:
weights (fi)
weightSum (N)
energySum (E)
*/
*weightSum=0;
*energySum=0;
for(int i = 0;i<numEnergyLevels;i++)
{
*(weights+i) = 1.0/(1.0+exp((*(energyLevels+i)-fermiEnergy)/temperature));
*weightSum += *(weights+i);
*energySum += *(energyLevels+i) * *(weights+i);
}
}
double solveFermiEnergy(double currentExcitations, double fermiGuess, double fermiConvergence, double *energyLevels,double *weights, int numEnergyLevels, double temperature, double *energy)
{
double fermiMin = fermiGuess - 7.5*fermiConvergence;
double fermiMax = fermiGuess + 8.49*fermiConvergence;//This is slightly offset so that the fermi level could be almost unchanged.
double totalCarriersMin=0;
double totalCarriersMax=0;
//Try to bound the fermi level within a small box around the previous result which requires not more than 6 iterations to converge.
fermiSum( energyLevels, fermiMin, temperature, numEnergyLevels, weights, &totalCarriersMin, energy);
fermiSum( energyLevels, fermiMax, temperature, numEnergyLevels, weights, &totalCarriersMax, energy);
if((totalCarriersMin > currentExcitations) || (totalCarriersMax < currentExcitations))//If the solution is not within the bounds expand the bounds.
{
//printf("Fermi level outside bounds, Min %.10g max %.10g, min carriers %.10g, max carriers %.10g, target carriers %g\n",fermiMin,fermiMax,totalCarriersMin,totalCarriersMax,currentExcitations);
fermiMin = optics_llimit;
fermiMax = optics_ulimit;
}
double totalCarriers=0;
while(fermiMax-fermiMin > fermiConvergence)
{
fermiGuess = (fermiMax+fermiMin)/2.0;
fermiSum( energyLevels, fermiGuess, temperature, numEnergyLevels, weights, &totalCarriers, energy);
if(totalCarriers > currentExcitations) //Lower search range
fermiMax = fermiGuess;
else
fermiMin = fermiGuess;
//printf("Fermi range %g %g Carriers %g %g\n",fermiMin,fermiMax,totalCarriers,currentExcitations);
}
return fermiGuess;
}
double RandDouble(double low, double high, unsigned int *state) {
double t = (double)rand_r(state) / (double)RAND_MAX;
return (1.0f - t) * low + t * high;
}
int num_pthreads = 256;
// Number of elements per vector; arbitrary,
int ELEMENT_N = 64000;
// Total number of data elements
int sDATA_SZ;
int num_blocks = 1;
int num_threads = 128;
int run_mask = 3;
int repeats = 1;
int memcpy_mask = 31;
double fermiConvergence = 1e-7;
void * single_thread(void *data)
{
int thread_number = *((int *)data);
int num_gpu;
cudaGetDeviceCount(&num_gpu);
cudaSetDevice((double)thread_number/num_pthreads*num_gpu);
double *h_energyLevels, *h_weights, *weights_CPU, fermiEnergy_CPU, h_fermiEnergy, h_energy, energy_CPU;
double *d_energyLevels, *d_weights, *d_energyWeights, *d_weightSum, *d_fermiEnergy, *d_energy;
int i,iRepeat;
int saccum_size = num_threads*sizeof(double);
if(thread_number==0)
printf("Initializing data...\n");
if(thread_number==0)
printf("...allocating CPU memory.\n");
h_energyLevels = (double *)malloc(sDATA_SZ);
h_weights = (double *)malloc(sDATA_SZ);
weights_CPU = (double *)malloc(sDATA_SZ);
if(thread_number==0)
printf("...allocating GPU memory.\n");
nvtxRangePushA(":CUDAMalloc");
cudaMalloc((void **)&d_energyLevels, sDATA_SZ);
cudaMalloc((void **)&d_weights, sDATA_SZ);
cudaMalloc((void **)&d_energyWeights, saccum_size);
cudaMalloc((void **)&d_energy, sizeof(double));
cudaMalloc((void **)&d_weightSum, saccum_size);
cudaMalloc((void **)&d_fermiEnergy, sizeof(double));
cudaStreamSynchronize(0);//Finish Mallocs before starting Memcpy
nvtxRangePop();
if(thread_number==0)
printf("...initializing data.\n");
unsigned int state;
for (i = 0; i < ELEMENT_N; i++)
{
h_energyLevels[i] = RandDouble(0.0f, 1.0f,&state);
}
h_fermiEnergy = 0.5;
if((run_mask / 2) % 2 == 1)
{
if(thread_number==0)
printf("...Beginning CPU calculation.\n");
for(iRepeat=0;iRepeat<repeats;iRepeat++)
{
fermiEnergy_CPU = solveFermiEnergy( ELEMENT_N/2, fermiEnergy_CPU, fermiConvergence, h_energyLevels,weights_CPU,ELEMENT_N,0.025,&energy_CPU);
}
if(thread_number==0)
printf("Fermi Level from CPU is %g energy %g\n",fermiEnergy_CPU,energy_CPU);
}
if ((run_mask % 2 == 1))
{
if(thread_number==0)
printf("...copying input data to GPU mem.\n");
// Copy data to GPU memory for processing
cudaMemcpyAsync(d_energyLevels, h_energyLevels, sDATA_SZ, cudaMemcpyHostToDevice,0);
cudaMemcpyAsync(d_fermiEnergy, &h_fermiEnergy, sizeof(double), cudaMemcpyHostToDevice,0);
cudaStreamSynchronize(0);
if(thread_number==0)
printf("...Running GPU Kernel.\n");
for(iRepeat=0;iRepeat<repeats;iRepeat++)
{
nvtxRangePushA(":ELEVELS_IN");
if((memcpy_mask >> ELEVELS_IN) %2)
cudaMemcpyAsync(d_energyLevels, h_energyLevels, sDATA_SZ, cudaMemcpyHostToDevice,0);
cudaStreamSynchronize(0);
nvtxRangePop();
nvtxRangePushA(":FENERGY_IN");
if((memcpy_mask >> FENERGY_IN) %2)
cudaMemcpyAsync(d_fermiEnergy, &h_fermiEnergy, sizeof(double), cudaMemcpyHostToDevice,0);
cudaStreamSynchronize(0);
nvtxRangePop();
nvtxRangePushA(":KERNEL");
fermiKernel<<<1, num_threads,0,0>>>(d_energyLevels, d_fermiEnergy, ELEMENT_N/2, fermiConvergence , 0.025, ELEMENT_N, d_weights,d_energyWeights,d_energy,d_weightSum);
cudaStreamSynchronize(0);
nvtxRangePop();
nvtxRangePushA(":WEIGHTS_OUT");
if((memcpy_mask >> WEIGHTS_OUT) %2)
cudaMemcpyAsync(h_weights, d_weights, sDATA_SZ, cudaMemcpyDeviceToHost,0);
cudaStreamSynchronize(0);
nvtxRangePop();
nvtxRangePushA(":FENERGY_OUT");
if((memcpy_mask >> FENERGY_OUT) %2)
cudaMemcpyAsync(&h_fermiEnergy, d_fermiEnergy, sizeof(double), cudaMemcpyDeviceToHost,0);
cudaStreamSynchronize(0);
nvtxRangePop();
nvtxRangePushA( ":ENERGY_OUT");
if((memcpy_mask >> ENERGY_OUT) %2)
cudaMemcpyAsync(&h_energy, d_energyWeights, sizeof(double), cudaMemcpyDeviceToHost,0);
cudaStreamSynchronize(0);
nvtxRangePop();
}
cudaMemcpyAsync(h_weights, d_weights, sDATA_SZ, cudaMemcpyDeviceToHost,0);
cudaMemcpyAsync(&h_fermiEnergy, d_fermiEnergy, sizeof(double), cudaMemcpyDeviceToHost,0);
cudaMemcpyAsync(&h_energy, d_energyWeights, sizeof(double), cudaMemcpyDeviceToHost,0);
cudaStreamSynchronize(0);
if(thread_number==0)
printf("Fermi Level GPU is %g Energy GPU is %g\n",h_fermiEnergy,h_energy);
cudaStreamSynchronize(0);//Finish Memcpys before Cuda_free
}
if(thread_number==0)
printf("...freeing GPU memory.\n");
nvtxRangePushA(":CUDAFree");
cudaFree(d_energyLevels);
cudaFree(d_weights);
cudaFree(d_energyWeights);
cudaFree(d_weightSum);
cudaFree(d_fermiEnergy);
cudaFree(d_energy);
cudaStreamSynchronize(0);
nvtxRangePop();
return 0;
}
int main(int argc, char **argv)
{
if (argc == 8)
{
num_pthreads = atoi(argv[1]);
num_blocks = atoi(argv[2]);
num_threads = atoi(argv[3]);
run_mask = atoi(argv[4]);
repeats = atoi(argv[5]);
ELEMENT_N = atoi(argv[6]);
memcpy_mask = atoi(argv[7]);
}
else
printf("Usage: distribution_GPU num_pthreads num_blocks num_threads run_mask repeats elements memcpy_mask[0-31]\n");
printf("Running with %d pthreads %d blocks %d threads, %d run_mask and %d repeats %d elements\n",num_pthreads,num_blocks,num_threads,run_mask,repeats,ELEMENT_N);
if((memcpy_mask >> ELEVELS_IN) %2)
printf("Copying energy levels in\n");
if((memcpy_mask >> FENERGY_IN) %2)
printf("Copying fermi energy in\n");
if((memcpy_mask >> WEIGHTS_OUT) %2)
printf("Copying weights out\n");
if((memcpy_mask >> FENERGY_OUT) %2)
printf("Copying fermi energy out\n");
if((memcpy_mask >> ENERGY_OUT) %2)
printf("Copying energy out\n");
printf("%s Starting...\n\n", argv[0]);
sDATA_SZ = ELEMENT_N * sizeof(double);
int iThread = 0;
int *threadNumber = (int *)malloc(num_pthreads*sizeof(int));
void *retval;
pthread_t *threads = (pthread_t *)malloc(num_pthreads*sizeof(pthread_t));
for(iThread=0;iThread<num_pthreads;iThread++)
{
*(threadNumber+iThread) = iThread;
pthread_create(threads+iThread,NULL,(void *(*)(void *))single_thread,(void *)(threadNumber+iThread));
}
for(iThread=0;iThread<num_pthreads;iThread++)
{
pthread_join(*(threads+iThread),&retval);
}
}