Hi,
I cannot achieve kernel concurrency with the code below (don’t mind that the code does not do anything smart — this is just a test). I have compiled with the --default-stream per-thread option, which creates a separate default stream for each host thread — a feature available since CUDA 7. I have confirmed in Visual Profiler that a distinct CUDA stream is indeed created for each host thread, but the kernels still do not run concurrently — they appear to be serialized. I am using 16 host threads. Can anyone see the problem?
// Dummy per-element kernel used to generate measurable GPU work.
// Expected launch layout: r indexes rows via (blockIdx.x, threadIdx.x) and
// c indexes columns via (blockIdx.y, threadIdx.y); out-of-range threads are
// guarded by the bounds check.
//
// s1, s2 : device pointers to source matrices (pitched layout; `pitch` is in
//          elements here, since it multiplies an unsigned char index directly)
// d      : device pointer to destination matrix
// rows, cols, pitch : matrix geometry
//
// NOTE(review): the 100000-iteration loop recomputes and stores the same value
// each pass — it exists only to lengthen kernel runtime for concurrency tests.
// The compiler may legally collapse it since every iteration is identical;
// verify in the profiler that the kernel actually runs long. Also, if any
// s2 element is 0 the final division is undefined behavior on the device.
__global__ void matoperation(unsigned char *s1, unsigned char *s2, unsigned char *d, int rows, int cols, int pitch)
{
    int r = blockIdx.x * blockDim.x + threadIdx.x;
    int c = blockIdx.y * blockDim.y + threadIdx.y;
    // Busy loop: repeat the same computation to make the kernel long-running.
    for (int i = 1; i < 100000; i++)
        if ((r < rows) && (c < cols))   // bounds guard: grid may overshoot the matrix
            d[r * pitch + c] = (((s1[r * pitch + c] / 3 + s2[r * pitch + c] / 3) / 7 + s2[r * pitch + c]
                - s1[r * pitch + c]) * s1[r * pitch + c]) / s2[r * pitch + c];
}
// Launches the dummy kernel on the calling thread's default stream and waits
// for it to finish. With --default-stream per-thread, stream 0 here refers to
// the per-thread default stream, so the synchronize only blocks this thread's
// own work.
//
// NOTE(review): <<<1, 1>>> launches a single thread, so only element (0,0) is
// ever computed. Such tiny kernels finish almost instantly, which can make
// them *look* serialized in the profiler even when streams overlap correctly.
void DummyOperation(unsigned char *Src1, unsigned char *Src2, unsigned char *Dst, int rows, int cols, int pitch)
{
    matoperation<<<1, 1>>>(Src1, Src2, Dst, rows, cols, pitch);

    // Kernel launches return no status directly: launch-config errors surface
    // via cudaGetLastError(), execution faults at the next synchronizing call.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("matoperation launch failed: %s\n", cudaGetErrorString(err));

    err = cudaStreamSynchronize(0);  // per-thread default stream (compiled with --default-stream per-thread)
    if (err != cudaSuccess)
        printf("matoperation execution failed: %s\n", cudaGetErrorString(err));
}
// Thread function (implemented using boost library)
// Per-thread worker loop: repeatedly fetches a matrix, copies it to the
// device, runs the dummy kernel, and copies the result back, until the
// boost thread is interrupted.
//
// NOTE(review): the plain (synchronous) cudaMemcpy calls are the most likely
// cause of the serialization observed across the 16 threads — a blocking
// cudaMemcpy synchronizes with device work and prevents overlap between
// threads. To see concurrency, use pinned host buffers (cudaMallocHost) with
// cudaMemcpyAsync on the per-thread stream instead. TODO confirm in profiler.
void Process(int ThreadID)
{
    // ...
    // Here set rows, cols and pitch
    // Here allocate memory for Src1GPU, Src2GPU, DstGPU at the device using cudaMallocPitch
    // to make sure that memory at the device is allocated only once when this thread starts
    // ...
    try
    {
        while(1)
        {
            boost::this_thread::interruption_point();
            // ...
            // Here fetch new matrix from somewhere in the RAM
            // copy matrix to the device (NOT using async copy)
            // ...
            DummyOperation(Src1GPU, Src2GPU, DstGPU, rows, cols, pitch);
            // ...
            // here copy result from device into host matrix (NOT using async copy)
            // do something with this copy...
        }
    } catch (boost::thread_interrupted&)
    {
        // Fixed: the original used typographic quotes (“ ”) around the string
        // literal, which does not compile; they must be ASCII double quotes.
        printf("Process thread interrupted\n");
    }
}