Hi there,
First, I wish all of you a peaceful and merry Christmas. ^-^
Now, on to my question. I have the following code:
// Host-side driver loop. Each pass of the outer loop launches kernel1, then
// runs an inner loop of kernel2..kernel5, then launches kernel6 and kernel7.
// The host waits on cudaDeviceSynchronize() after every single launch.
//
// Upper bound on inner-loop passes per outer pass.
const int MAX_NUM_ITERATIONS = 50;
// Lower bound on inner-loop passes; these run regardless of continue_condition.
const int MIN_NUM_ITERATIONS = 5;
bool running = true; // this value will be changed through keyboard events
// Nothing inside this loop writes `running`; per the comment above it is
// expected to be cleared from elsewhere (keyboard event handling, not shown).
while (running)
{
kernel1<<<NUM_BLOCKS, NUM_THREADS>>>();
cudaDeviceSynchronize ();
int num_iterations = 0;
bool continue_condition = true;
// Always run the first MIN_NUM_ITERATIONS passes; after that keep going only
// while continue_condition is true and fewer than MAX_NUM_ITERATIONS passes
// have run.
while ( (num_iterations < MIN_NUM_ITERATIONS) || (continue_condition && (num_iterations < MAX_NUM_ITERATIONS)) )
{
// NOTE(review): none of these launches names a stream, so they all go to the
// default stream and should already execute in issue order. The host-side
// sync after each launch is probably only needed where the host reads a
// result (e.g. the continue flag) -- confirm before removing any.
kernel2<<<NUM_BLOCKS, NUM_THREADS>>>(...);
cudaDeviceSynchronize ();
kernel3<<<NUM_BLOCKS, NUM_THREADS>>>(...);
cudaDeviceSynchronize ();
// NOTE(review): as written, continue_condition is a host variable handed to
// kernel4 as a launch argument. kernel4's signature is not visible here, but
// if the flag is passed by value the kernel cannot change the host copy, and
// this loop would always run MAX_NUM_ITERATIONS passes. Presumably the real
// code uses a device-side flag that is copied back to the host -- TODO confirm.
kernel4<<<NUM_BLOCKS, NUM_THREADS>>>(..., continue_condition);
cudaDeviceSynchronize ();
kernel5<<<NUM_BLOCKS, NUM_THREADS>>>(...);
cudaDeviceSynchronize ();
++num_iterations;
}
// Two more launches after the inner loop, once per outer pass.
kernel6<<<NUM_BLOCKS, NUM_THREADS>>>();
cudaDeviceSynchronize ();
kernel7<<<NUM_BLOCKS, NUM_THREADS>>>();
cudaDeviceSynchronize ();
}
Here are my questions:
(1) How much benefit can I get if I use dynamic parallelism for this code on my GTX 780?
(2) Is the kernel launch time from device code shorter than from host code?
Thanks.