Hi!
I have the following problem:
I wrote this code:
// Adds two float vectors on the GPU: _pResult[i] = _p1[i] + _p2[i] for i in [0, _uiSize).
//   _p1, _p2  : host input arrays of _uiSize floats
//   _pResult  : host output array of _uiSize floats
//   _ulTime   : out-param receiving the elapsed host clock() ticks for the whole
//               allocate / copy / launch / copy-back / free sequence.
// NOTE: clock() measures *host* CPU ticks (resolution CLOCKS_PER_SEC), NOT GPU
// cycles — that is why the reading barely changes with the array size. For real
// device timing use cudaEvent_t (cudaEventRecord / cudaEventElapsedTime).
__host__ void CuAddVector(float *_p1, float *_p2, unsigned int _uiSize, float *_pResult, unsigned int *_ulTime)
{
unsigned int _uiStopTime, _uiStartTime;
//device memory pointers (graphics card)
float *_fCuda1 = NULL, *_fCuda2 = NULL, *_fCudaResult = NULL;
size_t _uiBytes = _uiSize * sizeof(float);
_uiStartTime = clock();
//allocate device memory (graphics card)
cudaMalloc((void**)&_fCuda1, _uiBytes);
cudaMalloc((void**)&_fCuda2, _uiBytes);
cudaMalloc((void**)&_fCudaResult, _uiBytes);
//copy input data from host memory (RAM) to device memory (graphics card)
cudaMemcpy(_fCuda1, _p1, _uiBytes, cudaMemcpyHostToDevice);
cudaMemcpy(_fCuda2, _p2, _uiBytes, cudaMemcpyHostToDevice);
//vector addition (only one thread/kernel) — the launch is ASYNCHRONOUS:
//control returns to the host immediately.
globalAddVector<<<dim3(1,1,1),dim3(1,1,1)>>>(_fCuda1, _fCuda2, _uiSize, _fCudaResult);
//copy output data from device memory to host memory. A blocking cudaMemcpy
//implicitly waits until all preceding work on the stream (the kernel above)
//has finished, so the result is complete when this call returns.
cudaMemcpy(_pResult, _fCudaResult, _uiBytes, cudaMemcpyDeviceToHost);
//release device memory (the original version leaked all three buffers)
cudaFree(_fCuda1);
cudaFree(_fCuda2);
cudaFree(_fCudaResult);
_uiStopTime = clock();
if (_uiStopTime >= _uiStartTime)
*_ulTime = _uiStopTime - _uiStartTime;
else
//clock() wrapped around: elapsed = stop + (max - START).
//(The original subtracted _uiStopTime from 0xffffffff here, which
//made the result always equal 0xffffffff regardless of elapsed time.)
*_ulTime = _uiStopTime + (0xffffffff - _uiStartTime);
}
P1 and P2 are the arrays I want to add, and uiSize is the size of both. The function globalAddVector adds them (using one thread). With the difference of uiStartTime and uiStopTime I want to measure the GPU cycles. But independent of the array size (I tested values between 10 and 10000), ulTime is always about 60 - 80! What is the error?
And a second short question: how does the function
cudaMemcpy(_pResult, _fCudaResult, _uiSize*sizeof(float), cudaMemcpyDeviceToHost);
know that the kernel 'globalAddVector' has finished?
Please help me!