My function that uses CUDA:
/*
 * Runs one unit of GPU work on a private CUDA stream:
 *   1. creates a new stream,
 *   2. enqueues device-to-device copies on it inside a loop,
 *   3. launches mykernel on it,
 *   4. enqueues a device-to-host copy on it,
 *   5. blocks the calling host thread until the stream has drained,
 *   6. destroys the stream.
 *
 * NOTE(review): the function is declared to return int, but no return
 * statement is visible in this listing -- confirm one exists in the elided
 * parts or add one.
 * NOTE(review): the return values of the CUDA runtime calls below are
 * discarded, so a failure in any of them would go unnoticed here.
 */
int myfunc(...)
{
/* A stream is created (and later destroyed) on every call. */
cudaStream_t stream;
cudaStreamCreate(&stream);
...
for (...)
{
...
/* Device-to-device copy, ordered on this call's private stream. */
cudaMemcpyAsync(..., cudaMemcpyDeviceToDevice, stream);
...
}
...
/* Launch configuration: 1 block of k threads, 0 bytes of dynamic shared
 * memory, on the private stream. A grid of one block occupies only one
 * multiprocessor for the duration of the kernel. */
mykernel<<<1, k, 0, stream>>>(...);
/* Device-to-host copy of the result, ordered after the kernel on the same
 * stream.
 * NOTE(review): presumably the host destination should be page-locked
 * (cudaMallocHost/cudaHostAlloc) for this copy to be truly asynchronous --
 * confirm how the host buffer is allocated. */
cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, stream);
/* Wait for everything enqueued on the stream before tearing it down. */
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
}
My thread code:
/*
 * Worker thread entry point.
 *
 * Repeatedly claims a ticket from the shared counter cnt with an atomic
 * fetch-and-add and calls myfunc once per claimed ticket. The thread exits
 * with NULL as soon as it draws a ticket greater than TIMES.
 *
 * arg is not used by the visible code.
 *
 * NOTE(review): a ticket equal to TIMES still runs myfunc; if cnt starts at
 * 0 that amounts to TIMES + 1 calls across all threads -- confirm that this
 * is intended.
 */
void* mythread(void *arg)
{
    for (;;)
    {
        /* __sync_fetch_and_add yields the value cnt held before the add. */
        const int ticket = __sync_fetch_and_add(&cnt, 1);
        if (ticket <= TIMES)
        {
            myfunc(...);
            continue;
        }
        return NULL;
    }
}
....
/* Timestamp taken before any worker thread is created. */
gettimeofday(...);
/* Start THREADS worker threads (presumably running mythread -- the
 * pthread_create arguments are elided in this listing). */
for (i = 0; i < THREADS; i++)
pthread_create(...);
/* Wait for every worker thread to finish. */
for (i = 0; i < THREADS; i++)
pthread_join(...);
/* Timestamp taken after all threads have been joined; the interval between
 * the two gettimeofday calls therefore also includes thread creation and
 * join overhead. */
gettimeofday(...);
Hardware: GeForce GTX 660 Ti, which has 7 multiprocessors.
With THREADS = 1 the execution time is 3.8 s.
With THREADS = 7 the execution time is 3.2 s.
It looks like the kernels are executed sequentially.
Why are the threads not using different multiprocessors?
I called cudaGetDeviceProperties(…),
and concurrentKernels is set.