I’m working on a ray tracer using CUDA 8.0, and the kernel is launched from the following host function:
extern "C" float CudaRender(CudaScene& scene, int w, int h, CudaVec* output)
{
printf("CudaRender\n");
fflush(stdout);
if (output == nullptr) return 0.0;
if (scene.geolist.size() == 0 && scene.spherelist.size() == 0) return 0.0;
CudaVec* dev_result = nullptr;
cudaMalloc(&dev_result, sizeof(CudaVec)*w*h);
//CudaVec* dev_result_temp = nullptr;
//cudaMalloc(&dev_result_temp, sizeof(CudaVec)*w*h*SampleNum);
float elapsed = 0;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
fflush(stdout);
cudaStream_t stream[CUDA_XN*CUDA_YN];
// dev_randstates is already generated using curand_init in another kernel.
for (int i = 0; i < CUDA_XN; i++)
{
for (int j = 0; j < CUDA_YN; j++)
{
cudaStreamCreate(&stream[i + CUDA_XN * j]);
CudaMonteCarloRender << <dim3(w/CUDA_XN, h/CUDA_YN), SampleNum, 0, stream[i + CUDA_XN * j] >> > (
w/ CUDA_XN*i, h/ CUDA_YN*j,
dev_geos, scene.geolist.size(), // all the triangles and the number of triangles
dev_spheres, scene.spherelist.size(), // all the spheres and the number of spheres
scene.camera, w, h, dev_result, dev_randstates);
}
}
cudaDeviceSynchronize();
for (int i = 0; i < CUDA_XN*CUDA_YN; i++)
{
cudaStreamDestroy(stream[i]);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("The elapsed time in gpu was %.2f ms", elapsed);
fflush(stdout);
printf("Rendered\n");
fflush(stdout);
cudaMemcpy(output, dev_result, sizeof(CudaVec)*w*h, cudaMemcpyDeviceToHost);
cudaFree(dev_result);
return elapsed;
}
}
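
One thing worth noting: none of the CUDA calls above are checked for errors. A minimal checking sketch I could wrap around the stream creation and the launches would be something like this (CUDA_CHECK is just a helper name I’m making up here, not something already in the project):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Sketch of an error-checking helper; CUDA_CHECK is my own name for it.
    #define CUDA_CHECK(call)                                             \
        do {                                                             \
            cudaError_t err = (call);                                    \
            if (err != cudaSuccess)                                      \
                printf("CUDA error '%s' at %s:%d\n",                     \
                       cudaGetErrorString(err), __FILE__, __LINE__);     \
        } while (0)

    // Inside the loop it would become:
    //   CUDA_CHECK(cudaStreamCreate(&stream[i + CUDA_XN * j]));
    //   CudaMonteCarloRender<<<...>>>(...);
    //   CUDA_CHECK(cudaGetLastError());        // catches launch configuration errors
    // and after the loop:
    //   CUDA_CHECK(cudaDeviceSynchronize());   // catches errors from the kernels themselves
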
According to the programming guide, kernels launched into different streams should be able to run concurrently on the device, just as the concurrentKernels sample demonstrates. But the program above doesn’t behave that way on my computer. I’m using VS2015 and CUDA 8.0 on a GTX 1050 Ti. I don’t know how to insert an image here, but in the profiler the timeline looks like this:
Stream
default
stream14  ====
stream15      ====
stream16          ====
stream17              ====
stream18                  ====
stream19                      ====
stream20                          ====
stream21                              ====
The streams are created, but the kernels in them don’t run in parallel. The grid size is [30, 30, 1] and the block size is [8, 1, 1], which I think is definitely not too large for my card. So why does this happen? I’ve run the concurrentKernels sample and it works on my machine, but I can’t figure out what’s wrong with my code.
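
For comparison, a stripped-down two-stream test in the same spirit as the concurrentKernels sample would look roughly like this (spinKernel, SPIN_ITERS, and the launch sizes below are just illustrative, not code from my project):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void spinKernel(int iters)
    {
        // Burn some time so each launch is long enough to see in the profiler.
        float x = threadIdx.x;
        for (int i = 0; i < iters; i++)
            x = x * 0.999f + 0.001f;
        if (x < 0.0f) printf("%f\n", x);   // keep the loop from being optimized away
    }

    int main()
    {
        const int SPIN_ITERS = 1 << 20;
        cudaStream_t s[2];
        for (int i = 0; i < 2; i++)
            cudaStreamCreate(&s[i]);

        // One small block per launch, so a single launch cannot fill the GPU by itself.
        for (int i = 0; i < 2; i++)
            spinKernel<<<1, 64, 0, s[i]>>>(SPIN_ITERS);

        cudaDeviceSynchronize();
        for (int i = 0; i < 2; i++)
            cudaStreamDestroy(s[i]);
        return 0;
    }
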
Thanks!