hi,
I’m benchmarking my GTX 750 and wrote some code to test PCI-e bandwidth, but I’m getting implausible results.
The data size is about 68 MB, so 6 ms corresponds to roughly 11 GB/s, which is the expected bandwidth of PCI-e 3.0.
My program’s output (milliseconds) :
time full1 : 5.98417
time full2 : 5.91689
time 10008 : 5.86158
time 1000 : 0.636335
time 100 : 0.132855
time 50 : 0.32672
time 10 : 4.69325
time 1 : 321.036
Here is my test code. StartCounter and GetCounter work correctly, so there is no time-measurement error:
#include <windows.h>
// Performance-counter ticks per millisecond; written by StartCounter() and
// used as the divisor in GetCounter(). Stays 0.0 until StartCounter() runs.
double PCFreq = 0.0;
// Raw performance-counter value captured by the most recent StartCounter() call.
__int64 CounterStart = 0;
void * cuda_test1(){
uchar * dst;
uchar * data;
int const height = 10008;
int const width = 7092;
size_t size = sizeof(uchar) * height * width;
size_t packet;
int i, lines;
data = (uchar*)malloc(size);
cudaMalloc(&dst, size);
int err = cudaHostRegister(data, size, cudaHostRegisterDefault);
if (err != cudaSuccess)
return NULL;
StartCounter();
cudaMemcpy((void*)dst, (void*)data, size, cudaMemcpyHostToDevice);
std::cout << "time full1 : " << GetCounter() << std::endl;
StartCounter();
cudaMemcpy((void*)dst, (void*)data, size, cudaMemcpyHostToDevice);
std::cout << "time full2 : " << GetCounter() << std::endl;
//-----------
lines = height;
packet = (size_t)lines * width;
StartCounter();
for (i = 0; i < height; i += lines)
{
cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
}
std::cout << "time " << lines << " : " << GetCounter() << std::endl;
//-----------
lines = 1000;
packet = (size_t)lines * width;
StartCounter();
for (i = 0; i < height; i += lines)
{
cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
}
std::cout << "time " << lines << " : " << GetCounter() << std::endl;
//-----------
lines = 100;
packet = (size_t)lines * width;
StartCounter();
for (i = 0; i < height; i += lines)
{
cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
}
std::cout << "time " << lines << " : " << GetCounter() << std::endl;
//-----------
lines = 50;
packet = (size_t)lines * width;
StartCounter();
for (i = 0; i < height; i += lines)
{
cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
}
std::cout << "time " << lines << " : " << GetCounter() << std::endl;
//-----------
lines = 10;
packet = (size_t)lines * width;
StartCounter();
for (i = 0; i < height; i += lines)
{
cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
}
std::cout << "time " << lines << " : " << GetCounter() << std::endl;
//-----------
lines = 1;
packet = (size_t)lines * width;
StartCounter();
for (i = 0; i < height; i += lines)
{
cudaMemcpy(dst + i*packet, data + i*packet, packet, cudaMemcpyHostToDevice);
}
std::cout << "time " << lines << " : " << GetCounter() << std::endl;
return dst;
}
// Starts (or restarts) the stopwatch read by GetCounter().
//
// Refreshes PCFreq (performance-counter ticks per millisecond) and stores the
// current counter value in CounterStart. If the frequency query fails, a
// message is printed and PCFreq keeps its previous value instead of being
// overwritten with the contents of an unfilled LARGE_INTEGER.
void StartCounter()
{
    LARGE_INTEGER li;
    li.QuadPart = 0;
    if (QueryPerformanceFrequency(&li))
        PCFreq = double(li.QuadPart) / 1000.0;  // ticks/s -> ticks/ms
    else
        std::cout << "QueryPerformanceFrequency failed!\n";
    QueryPerformanceCounter(&li);
    CounterStart = li.QuadPart;
}
double GetCounter()
{
LARGE_INTEGER li;
QueryPerformanceCounter(&li);
return double(li.QuadPart - CounterStart) / PCFreq;
}
Since these memory copies are not asynchronous, I don’t understand why I get such short times.
Does anybody have an explanation for these results?
Thanks