Hello,
I’m wondering about the performance implications of various synchronization methods. To be precise, say you have two operations which need to be performed repeatedly and in succession, each one depending on the entire set of results of the previous one, as follows…
void worker(,,,)
{
    int i;
    int n; /* total number of iterations, say 1e6. */

    for (i = 0; i < n; i++) {
        operation1(,,,);
        operation2(,,,);
    }
}
As far as I can tell, there are numerous ways to do this using CUDA.
A: Using synchronous kernel launches… [EDIT: Not actually synchronous, my mistake; kernel launches return control to the host immediately.] (A sketch of forcing the launches to really be synchronous follows the code.)
__global__ void operation1(,,,)
{
    operation1_per_element(,,,);
}
__global__ void operation2(,,,)
{
    operation2_per_element(,,,);
}
void worker(,,,)
{
    int i;
    int n; /* total number of iterations, say 1e6. */

    for (i = 0; i < n; i++) {
        operation1<<<dimGrid, dimBlock>>>(,,,);
        operation2<<<dimGrid, dimBlock>>>(,,,);
    }
}
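(Side note, sketch only, same placeholder conventions as above: if I actually wanted A to behave synchronously, my understanding is I would have to block on the host myself after each launch, e.g. with cudaThreadSynchronize:)
void worker(,,,)
{
    int i;
    int n; /* total number of iterations, say 1e6. */

    for (i = 0; i < n; i++) {
        operation1<<<dimGrid, dimBlock>>>(,,,);
        cudaThreadSynchronize(); /* block the CPU until operation1 has really finished */
        operation2<<<dimGrid, dimBlock>>>(,,,);
        cudaThreadSynchronize(); /* ...and again for operation2 */
    }
}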
B: Using async kernel launches. Useful if you need to overlap device-host transfers with kernel execution (a small self-contained sketch of that overlap follows the code).
__global__ void operation1(,,,)
{
    operation1_per_element(,,,);
}
__global__ void operation2(,,,)
{
    operation2_per_element(,,,);
}
void worker(,,,)
{
    int i;
    int n; /* total number of iterations, say 1e6. */
    cudaStream_t stream[N_STREAMS]; /* arbitrary number of streams. */

    for (i = 0; i < N_STREAMS; i++)
        cudaStreamCreate(&stream[i]);

    for (i = 0; i < n; i++) {
        operation1<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        operation2<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
    }
}
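To illustrate what I mean by the overlap in B, here is a small self-contained toy (dummy_op, d_prev, h_snapshot and the sizes are all made-up names and numbers, not my real code): kernels queue up back to back in one stream while an async copy of an older buffer proceeds in a second stream; pinned host memory is what makes the copy actually asynchronous.
#include <stdio.h>
#include <cuda_runtime.h>

/* Placeholder kernel standing in for operation1/operation2. */
__global__ void dummy_op(float *buf, int len)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len)
        buf[idx] += 1.0f;
}

int main(void)
{
    const int len = 1 << 20;
    float *d_buf, *d_prev, *h_snapshot;
    cudaStream_t compute_stream, copy_stream;
    int i;

    cudaMalloc((void **)&d_buf, len * sizeof(float));
    cudaMalloc((void **)&d_prev, len * sizeof(float));
    cudaMemset(d_prev, 0, len * sizeof(float));
    cudaMallocHost((void **)&h_snapshot, len * sizeof(float)); /* pinned: needed for a truly async copy */
    cudaStreamCreate(&compute_stream);
    cudaStreamCreate(&copy_stream);

    for (i = 0; i < 10; i++) {
        /* kernels queue up back to back in compute_stream (as in B)... */
        dummy_op<<<(len + 255) / 256, 256, 0, compute_stream>>>(d_buf, len);
        dummy_op<<<(len + 255) / 256, 256, 0, compute_stream>>>(d_buf, len);
    }
    /* ...while a copy of an older buffer (not being written above) runs in copy_stream. */
    cudaMemcpyAsync(h_snapshot, d_prev, len * sizeof(float),
                    cudaMemcpyDeviceToHost, copy_stream);

    cudaStreamSynchronize(compute_stream);
    cudaStreamSynchronize(copy_stream);
    printf("snapshot[0] = %f\n", h_snapshot[0]);

    cudaStreamDestroy(compute_stream);
    cudaStreamDestroy(copy_stream);
    cudaFreeHost(h_snapshot);
    cudaFree(d_prev);
    cudaFree(d_buf);
    return 0;
}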
C: As B, but with cudaStreamSynchronize after each pair of launches so that the for loop on the CPU cannot run arbitrarily far ahead of the GPU.
__global__ void operation1(,,,)
{
    operation1_per_element(,,,);
}
__global__ void operation2(,,,)
{
    operation2_per_element(,,,);
}
void worker(,,,)
{
    int i;
    int n; /* total number of iterations, say 1e6. */
    cudaStream_t stream[N_STREAMS]; /* arbitrary number of streams. */

    for (i = 0; i < N_STREAMS; i++)
        cudaStreamCreate(&stream[i]);

    for (i = 0; i < n; i++) {
        operation1<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        operation2<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        cudaStreamSynchronize(stream[1]);
    }
}
D: As C, but with fewer cudaStreamSynchronize calls overall, one per four iterations (an event-based variant is sketched after the code)…
__global__ void operation1(,,,)
{
    operation1_per_element(,,,);
}
__global__ void operation2(,,,)
{
    operation2_per_element(,,,);
}
void worker(,,,)
{
    int i;
    int n; /* total number of iterations, say 1e6. */
    cudaStream_t stream[N_STREAMS]; /* arbitrary number of streams. */

    for (i = 0; i < N_STREAMS; i++)
        cudaStreamCreate(&stream[i]);

    for (i = 0; i < (n/4); i++) {
        operation1<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        operation2<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        operation1<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        operation2<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        operation1<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        operation2<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        operation1<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        operation2<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        cudaStreamSynchronize(stream[1]);
    }
}
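An event-based variant of D that I have been considering (again a sketch in the same placeholder style; the done[] events and the distance of four iterations are arbitrary choices of mine): record an event each iteration and wait on the event from a few iterations back, so the CPU stays a bounded distance ahead without draining the stream completely.
void worker(,,,)
{
    int i;
    int n; /* total number of iterations, say 1e6. */
    cudaStream_t stream[N_STREAMS]; /* arbitrary number of streams. */
    cudaEvent_t done[4];

    for (i = 0; i < N_STREAMS; i++)
        cudaStreamCreate(&stream[i]);
    for (i = 0; i < 4; i++)
        cudaEventCreate(&done[i]);

    for (i = 0; i < n; i++) {
        operation1<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        operation2<<<dimGrid, dimBlock, 0, stream[1]>>>(,,,);
        cudaEventRecord(done[i % 4], stream[1]);
        if (i >= 3)
            cudaEventSynchronize(done[(i - 3) % 4]); /* stay at most ~4 iterations ahead of the GPU */
    }
    cudaStreamSynchronize(stream[1]); /* drain whatever is still queued at the end */
}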
E: Looping within the kernel instead, i.e. some sort of monolithic kernel using threadfence… [EDIT: Doesn’t work. See replies.] (A toy example of the one block-local case where an in-kernel loop is safe follows the code.)
__global__ void monolithic_kernel(int n,,,)
{
    int i;
    for (i = 0; i < n; i++) {
        operation1_per_element(,,,);
        __threadfence();
        operation2_per_element(,,,);
        __threadfence();
    }
}
void worker(,,,)
{
    int n; /* total number of iterations, say 1e6. */
    monolithic_kernel<<<dimGrid, dimBlock>>>(n,,,);
}
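(To spell out the edit above: __threadfence() only orders memory accesses, it does not make blocks wait for each other, so it cannot express “operation2 depends on the entire result of operation1” across the grid. The one case where looping inside the kernel is safe is when each element of operation2 only needs operation1 results from its own block, in which case __syncthreads() is enough. A self-contained toy example of that special case, with made-up names and operations:)
#include <stdio.h>
#include <cuda_runtime.h>

#define BLOCK 256

/* Toy stand-in: "operation1" doubles each element, "operation2" averages a
   value with its right-hand neighbour *within the same block*, so the whole
   loop can stay inside one kernel with only block-level synchronization. */
__global__ void block_local_loop(float *data, int iters)
{
    __shared__ float tile[BLOCK];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    int i;

    tile[tid] = data[gid];
    __syncthreads();

    for (i = 0; i < iters; i++) {
        tile[tid] = 2.0f * tile[tid];          /* "operation1": per-element update */
        __syncthreads();                        /* all operation1 results visible to the block */
        float avg = 0.5f * (tile[tid] + tile[(tid + 1) % BLOCK]); /* "operation2": block-local dependence */
        __syncthreads();                        /* everyone has read before anyone overwrites */
        tile[tid] = avg;
        __syncthreads();
    }
    data[gid] = tile[tid];
}

int main(void)
{
    const int len = 1024;
    float h_init[1024], h_out[4];
    float *d_data;
    int i;

    for (i = 0; i < len; i++)
        h_init[i] = 1.0f;
    cudaMalloc((void **)&d_data, len * sizeof(float));
    cudaMemcpy(d_data, h_init, len * sizeof(float), cudaMemcpyHostToDevice);

    block_local_loop<<<len / BLOCK, BLOCK>>>(d_data, 10);

    cudaMemcpy(h_out, d_data, 4 * sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f %f %f %f\n", h_out[0], h_out[1], h_out[2], h_out[3]);
    cudaFree(d_data);
    return 0;
}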
NOTE: Obviously the A-E listings above are incomplete code…
From Section 2.1.1 of the “CUDA C Best Practices Guide”, Version 3.2, I understand that synchronization points such as these cause the thread scheduler to stall and thus stop it from being able to hide memory access latency.
MY QUESTION IS THIS:
Does synchronizing to the host (CPU) using cudaStreamSynchronize, cudaThreadSynchronize or synchronous launches have any more performance impact than launching a sequence of kernels into the same stream asynchronously? In other words, which is better: B, C or D?
Rephrasing yet again: is it any worse to synchronize to the host AND then launch a new kernel asynchronously than to just launch the new kernel without syncing to the host?
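For what it is worth, this is roughly how I plan to measure it myself; a self-contained skeleton with a do-nothing kernel, where sync_every is a knob I made up (1 corresponds to C, 4 to D, and a very large value approximates B):
#include <stdio.h>
#include <cuda_runtime.h>

/* Trivial placeholder standing in for operation1/operation2. */
__global__ void noop_op(float *buf, int len)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < len)
        buf[idx] += 1.0f;
}

/* Time n iterations of the operation1/operation2 pattern, calling
   cudaStreamSynchronize once every sync_every iterations. */
static float time_variant(int n, int sync_every, float *d_buf, int len,
                          cudaStream_t stream)
{
    cudaEvent_t start, stop;
    float ms;
    int i;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, stream);

    for (i = 0; i < n; i++) {
        noop_op<<<(len + 255) / 256, 256, 0, stream>>>(d_buf, len); /* "operation1" */
        noop_op<<<(len + 255) / 256, 256, 0, stream>>>(d_buf, len); /* "operation2" */
        if ((i + 1) % sync_every == 0)
            cudaStreamSynchronize(stream);
    }

    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}

int main(void)
{
    const int len = 1 << 20, n = 1000;
    float *d_buf;
    cudaStream_t stream;

    cudaMalloc((void **)&d_buf, len * sizeof(float));
    cudaStreamCreate(&stream);

    printf("sync every 1 (C-like):   %f ms\n", time_variant(n, 1, d_buf, len, stream));
    printf("sync every 4 (D-like):   %f ms\n", time_variant(n, 4, d_buf, len, stream));
    printf("sync at end only (B-ish): %f ms\n", time_variant(n, n, d_buf, len, stream));

    cudaStreamDestroy(stream);
    cudaFree(d_buf);
    return 0;
}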
Looking forward to hearing your responses…