Hi, sorry for the delay
I do not work in high-speed trading but in biomedical imaging. My original project was to improve the performance of a ray-tracing algorithm.
I am working on Linux with a Tesla M2050.
Kernel launch and the cudaMemcpy calls:
/* Copy 3 floats from aP1 / aP2 into the device symbols
 * aMainEntryPoint / aMainExitPoint. */
HANDLE_ERROR(cudaMemcpyToSymbol(aMainEntryPoint, aP1, 3 * sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(aMainExitPoint, aP2, 3 * sizeof(float)));

/* Launch the kernel on the default stream. */
TestLatency<<<Blocks, Threads>>>(Axis0,
                                 Axis1,
                                 Axis2,
                                 pCudaData->aDeviceVoxelList,
                                 pCudaData->aDeviceDim,
                                 RoundingRequired,
                                 minPoint,
                                 maxPoint);
/* A kernel launch returns no status of its own: query it explicitly so an
 * invalid launch configuration is reported instead of silently producing a
 * meaningless timing. */
HANDLE_ERROR(cudaGetLastError());

/* Copy the voxel list back to the host (NbVoxel elements of strucVoxel). */
HANDLE_ERROR(cudaMemcpy(aHostVoxelList,
                        pCudaData->aDeviceVoxelList,
                        NbVoxel * sizeof(strucVoxel),
                        cudaMemcpyDeviceToHost));

/* Close the timed section and wait until the stop event has completed.
 * NOTE(review): CudaStart is presumably recorded before this excerpt --
 * confirm where, since that decides what the elapsed time includes. */
HANDLE_ERROR(cudaEventRecord(CudaStop, 0));
HANDLE_ERROR(cudaEventSynchronize(CudaStop));
HANDLE_ERROR(cudaEventElapsedTime(&CudaTimeSpent, CudaStart, CudaStop));
cerr << "Execution GPU time = " << CudaTimeSpent << " (ms) " << endl ;

/* Destroy the timing events. */
HANDLE_ERROR(cudaEventDestroy(CudaStart));
HANDLE_ERROR(cudaEventDestroy(CudaStop));
Kernel code:
/**
 * TestLatency - kernel with an empty body.
 *
 * The body contains no statements, so every argument is accepted and
 * ignored and no memory is read or written.
 *
 * Parameters (all unused):
 *   Axis0, Axis1, Axis2  unsigned integer values.
 *   pVoxel               pointer to strucVoxel.
 *   aDim                 declared as unsigned int[3]; as a function
 *                        parameter this is an unsigned int pointer.
 *   RoundingRequired     boolean flag.
 *   minPoint, maxPoint   float values.
 */
__global__ void TestLatency(unsigned int Axis0,
                            unsigned int Axis1,
                            unsigned int Axis2,
                            strucVoxel*  pVoxel,
                            unsigned int aDim[3],
                            bool         RoundingRequired,
                            float        minPoint,
                            float        maxPoint)
{
    // No statements: the kernel does no work.
}
The device buffers are allocated before, and freed after, the timed section.
I ran this a hundred times and measured an average execution time of 0.05 ms.
If you need other information let me know.
Regards