pthreads and concurrent kernels

My function uses CUDA:

// Sketch of one unit of GPU work issued on a private stream: device-to-device
// copies in a loop, one kernel launch, a device-to-host copy, then a blocking
// wait before the stream is destroyed.
// NOTE(review): none of the CUDA return codes are checked, and no return
// statement is visible in this excerpt although the function is declared int.
int myfunc(...)
{
cudaStream_t stream;

// A new stream is created on every call and destroyed again at the end.
cudaStreamCreate(&stream);
...

for (...)
{
    ... 
    // Queued on `stream`, so it is ordered before the kernel launch below.
    cudaMemcpyAsync(..., cudaMemcpyDeviceToDevice, stream);
    ...
}
...
// Launch configuration: 1 block of k threads, no dynamic shared memory.
mykernel<<<1, k, 0, stream>>>(...);
// NOTE(review): presumably the host destination has to be pinned memory for
// this copy to be truly asynchronous — cannot tell from here, confirm.
cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, stream);
// Blocks the calling host thread until everything queued on `stream` is done.
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);    
}

My thread code:

// Worker loop: each pass atomically takes the next value of the shared
// counter `cnt` and runs myfunc once for it; the thread ends as soon as the
// value it took is greater than TIMES.
// NOTE(review): if cnt starts at 0 this allows TIMES + 1 calls in total —
// confirm that is intended.
void* mythread(void *arg)
{
    for (;;)
    {
        // __sync_fetch_and_add yields the counter value from before the add.
        const int ticket = __sync_fetch_and_add(&cnt, 1);
        if (ticket <= TIMES)
        {
            myfunc(...);
            continue;
        }
        return NULL;
    }
}
....
// Wall-clock measurement around the whole run: take a timestamp, start
// THREADS threads, wait for all of them, then take a second timestamp.
// NOTE(review): thread creation and join overhead is inside the measured
// interval.
gettimeofday(...);
// Arguments are elided here; presumably each thread runs mythread — confirm.
for (i = 0; i < THREADS; i++)
   pthread_create(...);
for (i = 0; i < THREADS; i++)
   pthread_join(...);
gettimeofday(...);

GPU: GeForce GTX 660 Ti
7 multiprocessors

With THREADS = 1 the execution time is 3.8 s.
With THREADS = 7 the execution time is 3.2 s.
It looks like the kernels are executed sequentially.
Why are the threads not using different multiprocessors?

I called cudaGetDeviceProperties(…) and
concurrentKernels is set.