Hi,
in order to compare the performance of the 2 APIs, I had to get involved with the event systems of both of them. At the moment, my time measuring looks like this:
CUDA:
cudaEventRecord(cu_lasEvents[0],cu_lsStream);
cudaEventSynchronize(cu_lasEvents[0]);
<<<kernel launch>>>
cudaEventRecord(cu_lasEvents[1],cu_lsStream);
cudaEventSynchronize(cu_lasEvents[1]);
cudaStreamSynchronize(cu_lsStream);
cudaEventElapsedTime(&(sTime->kernel), cu_lasEvents[0], cu_lasEvents[1]);
OpenCL:
clEnqueueNDRangeKernel(ocl_lsQueue, ocl_lsKernel, 1, NULL, &lulGlobalWorkSize, &lulLocalWorkSize, 0, NULL, &ocl_lasEvents[0]);
clWaitForEvents(3,ocl_lasEvents);
clGetEventProfilingInfo(ocl_lasEvents[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong),&ocl_ulStartFunction, &lulRet);
clGetEventProfilingInfo(ocl_lasEvents[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong),&ocl_ulStopKernel, &lulRet);
This is giving me pretty similar results (except that memory transfers between host and device take a lot longer in CL).
But when I change “CL_PROFILING_COMMAND_START” to “CL_PROFILING_COMMAND_QUEUED”, it takes …for example…4ms longer.
What's the best way to get comparable, reliable results?
Is it right to use cudaEventSynchronize on asynchronous devices with overlapping memcpy and kernel execution?