Hello,
I have a structure which is managed by the GPU, and each object of the structure carries out its operations in its own stream, which is a member variable. The goal is to allow operations performed on different objects to execute concurrently because they are logically independent.
I have two versions of the insert() method:
The first takes (page-locked) input and passes it to the kernel by issuing cudaMemcpyAsync() calls. Calls to this method from different objects do not execute concurrently, and I have no idea why.
/**
 * Copies k keys and k values from host memory into this object's device
 * buffers (dev_keys, dev_values) and launches multi_insert_kernel on them.
 *
 * Both copies and the kernel launch are issued on this object's `stream`,
 * so the kernel is ordered after the copies within that stream.
 *
 * If either copy fails to enqueue, the kernel is NOT launched (it would
 * otherwise consume whatever the device buffers held before) and the method
 * returns early. The method returns void, so the failure is reported the
 * same way a launch failure is: the caller can query cudaGetLastError()
 * after the call.
 *
 * NOTE(review): on devices of compute capability 3.0 or lower the CUDA
 * programming guide describes implicit synchronization: an operation that
 * needs a dependency check on a previously launched kernel (such as these
 * copies when the method is called repeatedly on the same stream) can hold
 * back kernel launches in other streams. That may explain why calls on
 * different objects do not overlap - confirm with a profiler.
 *
 * @param key   host array holding at least k keys (caller states: pinned memory)
 * @param value host array holding at least k values (caller states: pinned memory)
 * @param k     number of key/value pairs to copy and insert
 */
__host__ void insert(Key *key, Type *value, size_t k) //key, value are allocated using pinned memory
{
    //Dynamically allocated shared memory size in bytes (per Block)
    size_t smem_size=2*MAX_KEYS*MAX_HEIGHT*sizeof(Node<Key,Type>*);

    //Stage the keys; bail out before launching if the copy could not be enqueued
    cudaError_t error=cudaMemcpyAsync((void*)dev_keys, (void*) key, k*sizeof(Key),cudaMemcpyHostToDevice, stream);
    if(error!=cudaSuccess)
    {
        return;
    }

    //Stage the values; same rule as for the keys
    error=cudaMemcpyAsync((void*)dev_values, (void*) value, k*sizeof(Type),cudaMemcpyHostToDevice, stream);
    if(error!=cudaSuccess)
    {
        return;
    }

    //Launch Kernel
    multi_insert_kernel<<<blocksPerGrid,threadsPerBlock,smem_size,stream>>>(dev_keys, dev_values, k, prms);
}
The second version takes a pointer to mapped memory. It executes concurrently on the device.
/**
 * Launches multi_insert_kernel on this object's stream, handing the kernel
 * the caller's key/value pointers directly (no staging copy is issued here).
 *
 * @param key   pointer to k keys (caller states: mapped to the device)
 * @param value pointer to k values (caller states: mapped to the device)
 * @param k     number of key/value pairs to insert
 */
__host__ void insertM(Key *key, Type *value, size_t k) //key, value are mapped to device
{
    // Per-block dynamic shared memory: 2*MAX_KEYS*MAX_HEIGHT node pointers
    const size_t nodePtrSlots=2*MAX_KEYS*MAX_HEIGHT;
    const size_t sharedBytes=nodePtrSlots*sizeof(Node<Key,Type>*);

    // Kernel launch on this object's own stream
    multi_insert_kernel<<<blocksPerGrid,threadsPerBlock,sharedBytes,stream>>>(key, value, k, prms);
}
Calls are issued like this:
// Feed the input array `a` to the four lists in chunks of `increment`
// elements; each chunk is used as both the keys and the values.
// The bound is written as i+increment<=samples rather than
// i<=samples-increment: `i` is unsigned, so when samples < increment the
// subtraction would compare as a huge value and the loop would run far past
// the end of `a`. For samples >= increment both forms are equivalent.
for(unsigned long i=0; i+increment<=samples; i+=increment)
{
    gpulist1.insert(a+i,a+i,increment);
    gpulist2.insert(a+i,a+i,increment);
    gpulist3.insert(a+i,a+i,increment);
    gpulist4.insert(a+i,a+i,increment);
}
// Feed the array `d` to the four lists in chunks of `increment` elements
// through insertM; each chunk is used as both the keys and the values.
// The bound is written as i+increment<=samples rather than
// i<=samples-increment: `i` is unsigned, so when samples < increment the
// subtraction would compare as a huge value and the loop would run far past
// the end of `d`. For samples >= increment both forms are equivalent.
for(unsigned long i=0; i+increment<=samples; i+=increment)
{
    gpulist1.insertM(d+i,d+i,increment);
    gpulist2.insertM(d+i,d+i,increment);
    gpulist3.insertM(d+i,d+i,increment);
    gpulist4.insertM(d+i,d+i,increment);
}
The device is a GTX 560 Ti (compute capability 2.1). I can tell whether the kernels are executing concurrently by timing the execution.
Does anyone have any suggestions?
Thank you.