We’ve just started to look at what performance gains we could get from using CUDA, and the initial results are somewhat underwhelming…
When copying an array to and from the device, the average execution time for the “calc” method is 0.24 ms over 10000 tries.
The size of the data is 320x256. Is this a reasonable time, or have I missed something? Source code is provided below:
#include <cuda_runtime.h>

// Empty kernel: nothing is launched yet, so the timing measures only the copies.
__global__ void calc(double* input, double* output){
}

extern "C" __declspec(dllexport) void preAlloc(int rows, int columns, void** cuDataPtr, void** cuResultPtr, int* pitch){
    size_t widthInBytes = rows * sizeof(double);  // width of one row in bytes
    size_t devPitch = 0;                          // cudaMallocPitch writes a size_t, not an int
    cudaMallocPitch(cuDataPtr, &devPitch, widthInBytes, (size_t)columns);
    cudaMallocPitch(cuResultPtr, &devPitch, widthInBytes, (size_t)columns);
    *pitch = (int)devPitch;
}

extern "C" __declspec(dllexport) void calc(void* cuDataPtr, double* input, void* cuResultPtr, double* output, int rows, int columns, int pitch){
    size_t widthInBytes = rows * sizeof(double);  // cudaMemcpy2D takes the width in bytes, not elements
    // The host buffers are densely packed, so their pitch is the plain row width; only the device side uses the padded pitch.
    cudaMemcpy2D(cuDataPtr, (size_t)pitch, input, widthInBytes, widthInBytes, (size_t)columns, cudaMemcpyHostToDevice);
    cudaMemcpy2D(output, widthInBytes, cuResultPtr, (size_t)pitch, widthInBytes, (size_t)columns, cudaMemcpyDeviceToHost);
}
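The timing code itself isn’t shown above; for context, a minimal sketch of how the calls are driven, assuming a plain C++ host loop compiled in the same module (the real caller may differ, e.g. an application using the exported DLL), would look roughly like this:

#include <chrono>
#include <cstdio>
#include <vector>

// Hypothetical driver: allocate host buffers, call preAlloc once, then average 10000 calc() calls.
int main(){
    const int rows = 320, columns = 256, iterations = 10000;
    std::vector<double> input(rows * columns), output(rows * columns);
    void* cuData = 0; void* cuResult = 0; int pitch = 0;
    preAlloc(rows, columns, &cuData, &cuResult, &pitch);

    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < iterations; ++i)
        calc(cuData, input.data(), cuResult, output.data(), rows, columns, pitch);
    auto stop = std::chrono::high_resolution_clock::now();

    double ms = std::chrono::duration<double, std::milli>(stop - start).count() / iterations;
    printf("average calc() time: %.3f ms\n", ms);
    return 0;
}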
0.24 ms seems like a lot of time just to copy 640k of data to and from the device on an x260 card?
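If it helps to narrow this down, one way to see how much of the 0.24 ms is raw transfer time versus per-call overhead would be to time a single copy with CUDA events. This is only an illustrative helper (timeCopy is not part of the code above), reusing the same pitched buffer and dimensions:

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative only: time one host-to-device copy of the 320x256 double buffer with CUDA events.
void timeCopy(void* cuDataPtr, double* input, int rows, int columns, int pitch){
    size_t widthInBytes = rows * sizeof(double);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaMemcpy2D(cuDataPtr, (size_t)pitch, input, widthInBytes,
                 widthInBytes, (size_t)columns, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("host-to-device copy: %.3f ms\n", ms);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}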