I have three different and independent algorithms written in CUDA. Each algorithm is a series of kernels, but the kernels do not fully use the processing power of the latest GPUs. I would like to run the 3 algorithms in parallel using 3 OpenMP threads, each controlling one stream of the GPU. The three threads would fill 3 Hyper-Q queues in parallel, and the GPU would be able to schedule the different kernels in an optimal fashion in order to maximize its utilization. The code below illustrates my setup; in this example, kernel_a, kernel_b, etc. are just dummy kernels that consume processing power. This code executes well on my GTX 680, but I am looking at purchasing a GTX Titan to benefit from Hyper-Q.

My questions are:

- Do you see any problem with my idea?
- Can different OpenMP threads access a single GPU? (My code executes fine, but could I run into problems?)
- Do I need a GTX-Titan or a Tesla-K20 for hyper-Q with OpenMP (no MPI here)?
- Do I need to protect every kernel call with a “#pragma omp critical” statement?

```
#include <stdio.h>
#include <stdlib.h>

#include <omp.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 100000
// Dummy busy-work kernel (N iterations): exists only to keep the GPU
// occupied so concurrent streams have work to overlap. Launched <<<1,1>>>.
__global__ void kernel_a()
{
    // volatile keeps the compiler from deleting the loop as dead code
    // (sum is never stored anywhere), which would turn this into a no-op.
    volatile float sum = 0.0f;
    for (int i = 0; i < N; i++)
    {
        // cosf/0.1f keep the arithmetic in single precision; the original
        // cos(0.1) silently promoted every operation to double.
        sum = sum + cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f);
    }
}
// Dummy busy-work kernel (N/2 iterations): half the work of kernel_a so the
// scheduler sees kernels of differing duration. Launched <<<1,1>>>.
__global__ void kernel_b()
{
    // volatile keeps the compiler from deleting the loop as dead code
    // (sum is never stored anywhere), which would turn this into a no-op.
    volatile float sum = 0.0f;
    for (int i = 0; i < N/2; i++)
    {
        // cosf/0.1f keep the arithmetic in single precision; the original
        // cos(0.1) silently promoted every operation to double.
        sum = sum + cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f);
    }
}
// Dummy busy-work kernel (N/4 iterations): quarter the work of kernel_a.
// Launched <<<1,1>>>.
__global__ void kernel_c()
{
    // volatile keeps the compiler from deleting the loop as dead code
    // (sum is never stored anywhere), which would turn this into a no-op.
    volatile float sum = 0.0f;
    for (int i = 0; i < N/4; i++)
    {
        // cosf/0.1f keep the arithmetic in single precision; the original
        // cos(0.1) silently promoted every operation to double.
        sum = sum + cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f);
    }
}
// Dummy busy-work kernel (N/8 iterations): shortest of the four dummy
// kernels. Launched <<<1,1>>>.
__global__ void kernel_d()
{
    // volatile keeps the compiler from deleting the loop as dead code
    // (sum is never stored anywhere), which would turn this into a no-op.
    volatile float sum = 0.0f;
    for (int i = 0; i < N/8; i++)
    {
        // cosf/0.1f keep the arithmetic in single precision; the original
        // cos(0.1) silently promoted every operation to double.
        sum = sum + cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f)*cosf(0.1f);
    }
}
// Creates 32 streams, has 4 OpenMP threads enqueue dummy kernels into
// disjoint subsets of them, waits for completion, then tears everything down.
// Returns 0 on success, 1 on a setup failure.
int main()
{
    const int n_streams = 32;
    const int n_threads = 4;
    cudaError_t cudaStatus;

    // Allocate and initialize an array of stream handles.
    cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(cudaStream_t));
    if (streams == NULL)
    {
        fprintf(stderr, "malloc of stream array failed\n");
        return 1;
    }
    for (int i = 0; i < n_streams; i++)
    {
        cudaStatus = cudaStreamCreate(&(streams[i]));
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaStreamCreate(%d) failed: %s\n",
                    i, cudaGetErrorString(cudaStatus));
            free(streams);
            return 1;
        }
    }

    // Parallel section: each OpenMP thread owns a disjoint, strided subset of
    // the streams and enqueues four kernels into each. The CUDA runtime API
    // is thread-safe, so no "#pragma omp critical" is needed around launches.
    #pragma omp parallel num_threads(n_threads)
    {
        for (int i = omp_get_thread_num(); i < n_streams; i = i + n_threads)
        {
            kernel_a<<<1,1,0,streams[i]>>>();
            kernel_b<<<1,1,0,streams[i]>>>();
            kernel_c<<<1,1,0,streams[i]>>>();
            kernel_d<<<1,1,0,streams[i]>>>();
        }
        // Launch-configuration errors do not surface from <<<...>>> itself;
        // they must be fetched explicitly after the launches.
        cudaError_t launchErr = cudaGetLastError();
        if (launchErr != cudaSuccess)
        {
            fprintf(stderr, "kernel launch failed: %s\n",
                    cudaGetErrorString(launchErr));
        }
    }

    // Kernel launches are asynchronous: wait for all queued work to finish
    // before destroying the streams and resetting the device.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n",
                cudaGetErrorString(cudaStatus));
    }

    // Release all streams.
    for (int i = 0; i < n_streams; i++)
    {
        cudaStreamDestroy(streams[i]);
    }
    free(streams);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed: %s\n",
                cudaGetErrorString(cudaStatus));
    }
    return 0;
}
```