Thanks for your answer!
When I said 0 ms I meant that this code in my program
// Time one segment upload (host -> device).
// NOTE(review): GET_TIME is built on GetSystemTime, whose tick quantum on
// Windows is ~15.6 ms — any interval shorter than one tick prints as 0.000s.
GET_TIME(st);
//cst = clock();
if(is_single)
_sp->retrieve(dev_F, segment_index);   // single-precision path
else
_sp->retrieve(dev_Fd, segment_index);  // double-precision path
GET_TIME(et);
if (TIMING) printf("\tthread index %d: segment %d: iteration %d: memory copy time: %.3fs\n", dev_index, segment_index, k, et - st);
//cet = clock();
//if (TIMING) printf("\tthread index %d: segment %d: iteration %d: memory copy time(time.h): %d\n", dev_index, segment_index, k-1, cet - cst);
prints something like this
thread index 0: segment 0: iteration 1: memory copy time: 0.000s
Here is the retrieve() function
// Upload this storage's host buffer into the given CUDA array (H2D copy).
// Layout passed to cudaMemcpy2DToArray: source pitch == row width ==
// sizeof(T) * _N bytes (tightly packed rows), height == _segment_size / _N
// rows — i.e. _segment_size is presumably an element count divisible by _N;
// TODO confirm against the allocator.
// NOTE(review): with pageable host memory this copy blocks the host until
// complete, so host-side timers around the call are valid; pinned memory
// (cudaHostAlloc) would typically raise transfer bandwidth substantially.
template<typename T>
void MemoryStorage<T>::retrieve(cudaArray* ptr)
{
// args: (dst, wOffset=0, hOffset=0, src, spitch bytes, width bytes, height rows, kind)
CUDA_FUNCTION_CALL(cudaMemcpy2DToArray(ptr, 0, 0, this->_ptr, sizeof(T) * this->_N, sizeof(T) * this->_N, this->_segment_size / this->_N, cudaMemcpyHostToDevice));
}
I defined GET_TIME() macro as
#define GET_TIME(t) if (true){SYSTEMTIME time_t;GetSystemTime(&time_t); t = time_t.wMilliseconds / 1000.0 + time_t.wSecond + time_t.wMinute * 60.0 + time_t.wHour * 60.0 * 60.0;}
I also tried to measure the time with the clock() function from time.h (the commented-out lines) and got the same results. So "0 ms" means that the execution time of cudaMemcpy2DToArray is under 1 ms, even though the data size is 256 MB.
About the "twice faster" point:
the whole loop where I execute these data transfers looks like this
// Main iteration loop: run up to _K iterations, or stop early once every
// segment on every device reports convergence (_stop stays true).
// Each CPU worker thread owns one GPU (dev_index) and processes an
// interleaved subset of segments: dev_index, dev_index + _dev_count, ...
while (++k != _K + 1 && !_stop){
// Barrier: all worker threads start the iteration with a consistent _host_x_M.
sync_cpu_threads();
// Optimistically assume convergence; any non-converged segment clears it below.
_stop = true;
CUDA_FUNCTION_CALL(cudaMemcpy(dev_x_M, _host_x_M, sizeof(T) * _M, cudaMemcpyHostToDevice));
for (int segment_index = dev_index; segment_index < _segments_count; segment_index += this->_dev_count){
// Per-segment slice of the x vector. Presumably _sp->size() returns an
// element count divisible by _M — TODO confirm.
__int64 x_segment_size = _sp->size(segment_index) / _M;
__int64 start_index = x_segment_size * segment_index;
// Time the segment upload. NOTE(review): GET_TIME is GetSystemTime-based
// (~15.6 ms resolution), so sub-tick copies print as 0.000s.
GET_TIME(st);
//cst = clock();
if(is_single)
_sp->retrieve(dev_F, segment_index);   // single-precision path
else
_sp->retrieve(dev_Fd, segment_index);  // double-precision path
GET_TIME(et);
if (TIMING) printf("\tthread index %d: segment %d: iteration %d: memory copy time: %.3fs\n", dev_index, segment_index, k, et - st);
//cet = clock();
//if (TIMING) printf("\tthread index %d: segment %d: iteration %d: memory copy time(time.h): %d\n", dev_index, segment_index, k-1, cet - cst);
// Single precision reads the matrix through a texture; the double path
// passes dev_Fd to the kernel directly (double texture binding disabled).
if(is_single)
CUDA_FUNCTION_CALL(cudaBindTextureToArray(texRefA, dev_F));
/*else
CUDA_FUNCTION_CALL(cudaBindTextureToArray(texRefAd, dev_F));*/
CUDA_FUNCTION_CALL(cudaMemcpy(dev_x, _host_x + start_index, sizeof(T) * x_segment_size, cudaMemcpyHostToDevice));
if (TRACE) printf("\tthread index %d: segment %d: iteration %d\n", dev_index, segment_index, k);
// Reset per-segment convergence flag on the device (1 = converged so far).
host_stop = 1;
CUDA_FUNCTION_CALL(cudaMemcpy(dev_stop, &host_stop, sizeof(int) * 1, cudaMemcpyHostToDevice));
//will use memset instead of memcpy
// Time the kernel. The synchronize below is required for a meaningful
// host-side measurement, since the launch itself is asynchronous.
GET_TIME(st);
//cst = clock();
if(is_single)
iteration_v1<<< blocks, threads >>>((float*)dev_x, (float*)dev_x_M, (float*)dev_g, _M, start_index, (float)_epsilon, dev_stop);
else
iteration_v1d<<< blocks, threads >>>((double*)dev_x, (double*)dev_x_M, (double*)dev_g, (double*)dev_Fd, _M, start_index, (double)_epsilon, dev_stop);
// NOTE(review): cudaThreadSynchronize is deprecated; cudaDeviceSynchronize
// is the modern equivalent. Also returns any async kernel-execution error.
e = cudaThreadSynchronize();
GET_TIME(et);
if (TIMING) printf("\tthread index %d: segment %d: iteration %d: kernel work time: %.3fs\n", dev_index, segment_index, k, et - st);
//cet = clock();
//if (TIMING) printf("\tthread index %d: segment %d: iteration %d: kernel work time(time.h): %d\n", dev_index, segment_index, k-1, cet - cst);
//e = cudaGetLastError();
if (e != cudaSuccess) {
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",
__FILE__, __LINE__, cudaGetErrorString(e));
exit(e);
}
// Pull the segment's convergence verdict back to the host.
CUDA_FUNCTION_CALL(cudaMemcpy(&host_stop, dev_stop, sizeof(int) * 1, cudaMemcpyDeviceToHost));
if (dev_F){
if(is_single)
cudaUnbindTexture(texRefA);
/*else
cudaUnbindTexture(texRefAd);*/
//cudaFreeArray(dev_F);
//dev_F = NULL;
}
// Write the updated x slice back into the shared host vector.
CUDA_FUNCTION_CALL(cudaMemcpy(_host_x + start_index, dev_x, sizeof(T) * x_segment_size, cudaMemcpyDeviceToHost));
// _stop is shared across worker threads; guard the read-modify-write when
// more than one device is in use.
if(this->_dev_count > 1) LOCK_MUTEX(_g_mutex);
_stop &= (bool)host_stop;
if(this->_dev_count > 1) UNLOCK_MUTEX(_g_mutex);
}
if (TRACE && this->_dev_count > 1) printf("\tthread index %d: iteration done. synchronizing...\n", dev_index);
// Barrier: no thread starts iteration k+1 until all have finished k.
sync_cpu_threads();
if (TRACE && k < _K) printf("thread index %d: iteration %d complete (error_code %d).\n", dev_index, k, cudaGetLastError());
}
The _sp object contains the data, divided into segments of 256 MB each.
I can run this code on one of my GPUs (one worker thread) or on both (two worker threads). The first GPU processes segments from _sp with odd indexes, the second those with even indexes. When I use one GPU, the memory transfer time is nearly 46 ms. But when I use both, my program prints out different memory transfer times - from 46 ms to 15 ms and then 0 ms.
Tomorrow when I get access to my remote computer I’ll post exact speeds on Linux.