I was also doing my own tests, when I found this. I’m using a Core 2 Duo 2.4GHz with a NV80 card from EVGA and an Asus P5LD2-VM (Intel 945G). I am running SUSE 10.1 with kernel 2.6.16.27-0.9-smp.
I got 357.86ms for the test, which means about 1.4GB/sec. That is not much considering the 4 GB/s in each direction specified by the standard … I wonder where the limitation is …
Strange things came up when I started playing a little with the code. First, the speed seems to depend on whether the data is initialized or not. Second, host → device is faster than device → host.
Not inited (like in the test posted above):
(host → device): 356.212006 = 1.403658GB/s
(device → host): 958.921021 = 0.521419GB/s
Inited (all to 1.0 or something)
(host → device): 396.584991 = 1.260764GB/s
(device → host): 506.686005 = 0.986804GB/s
Why would that be?
I also computed the latency:
(host → device): 3209.617920 = 12.24us
(device → host): 3114.749023 = 11.885us
Is there anyone else who computed it? Of course, not so important as the bandwidth but … it would be nice to know.
Regards,
Serban
The code:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#include <string.h>
// Benchmark configuration.
// DATA_WIDTH / DATA_HEIGHT: buffer dimensions in elements. DATA_WIDTH is also
// used as the iteration count of the latency test.
// The product is parenthesized so the macro behaves as a single value inside
// larger expressions (e.g. `x / DATA_WIDTH`, `x % DATA_WIDTH`).
#define DATA_WIDTH (1024 * 256)
#define DATA_HEIGHT 1024
// Value every host element is initialized to and checked against.
#define VALUE 1.0
// Element type of the transferred buffer.
#define FORMAT short
// Measures host <-> device copy bandwidth for a width x height buffer of
// FORMAT elements.
//
// h_data        : host buffer with at least width * height elements. It is the
//                 source of the host -> device copy and is overwritten by the
//                 device -> host copy.
// width, height : buffer dimensions, in elements.
//
// For each direction, prints the elapsed time reported by the cutil timer and
// the GB/s figure derived from it (the formula treats the timer value as
// milliseconds). Finally prints a "CRC" line: the sum of all elements copied
// back minus VALUE * width * height, which is 0 when every element equals
// VALUE.
void runTest_bandwidth(FORMAT* h_data, int width, int height)
{
    CUT_CHECK_DEVICE();

    unsigned int hTimer;

    // Do the size arithmetic in size_t so large buffers do not overflow the
    // int product or get truncated to 32 bits.
    const size_t num_elems = (size_t) width * (size_t) height;
    const size_t mem_size = sizeof(FORMAT) * num_elems;

    // Allocate on device
    FORMAT* d_data;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_data, mem_size));

    // Run and time: host -> device
    cutCreateTimer(&hTimer);
    cutStartTimer(hTimer);
    CUDA_SAFE_CALL(cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice));
    cutStopTimer(hTimer);
    float elapsed = cutGetTimerValue(hTimer);
    printf("Bandwidth test (host -> device): %f = %fGB/s\n", elapsed,
           1000.0 * mem_size / (1024.0 * 1024.0 * 1024.0 * elapsed));
    // Release the first timer before creating the next one; otherwise the
    // handle is overwritten and the timer is never freed.
    cutDeleteTimer(hTimer);

    // Time again backwards: device -> host.
    // No kernel is launched in this test; this check only reports a pending
    // CUDA error, if any (the message text is kept from the original).
    CUT_CHECK_ERROR("Kernel execution failed");
    cutCreateTimer(&hTimer);
    cutStartTimer(hTimer);
    CUDA_SAFE_CALL(cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost));
    cutStopTimer(hTimer);
    elapsed = cutGetTimerValue(hTimer);
    printf("Bandwidth test (device -> host): %f = %fGB/s\n", elapsed,
           1000.0 * mem_size / (1024.0 * 1024.0 * 1024.0 * elapsed));
    cutDeleteTimer(hTimer);

    // Free the device buffer exactly once, and outside the timed region so
    // the deallocation cost does not distort the device -> host figure.
    CUDA_SAFE_CALL(cudaFree(d_data));

    // Check: every element copied back should still equal VALUE.
    double sum = 0;
    for (size_t i = 0; i < num_elems; i++)
        sum += h_data[i];
    printf("CRC: %f\n", sum - (double) VALUE * (double) num_elems);
}
// Estimates per-transfer latency by repeatedly copying a single FORMAT
// element between host and device.
//
// h_data : host buffer with at least one element. Its first element is the
//          source of the host -> device copies and is overwritten by the
//          device -> host copies.
// width  : number of single-element copies issued per direction.
//
// For each direction, prints the total time reported by the cutil timer and
// the per-copy time in microseconds (total * 1000 / width, i.e. the timer
// value is treated as milliseconds).
void runTest_latency(FORMAT* h_data, int width)
{
    CUT_CHECK_DEVICE();

    unsigned int hTimer;

    // Allocate a single element on the device
    const unsigned int mem_size = sizeof(FORMAT) * 1;
    FORMAT* d_data;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_data, mem_size));

    // Run and time: host -> device
    cutCreateTimer(&hTimer);
    cutStartTimer(hTimer);
    for (long i = 0; i < width; i++)
        CUDA_SAFE_CALL(cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice));
    cutStopTimer(hTimer);
    float elapsed = cutGetTimerValue(hTimer);
    printf("Latency test (host -> device): %f = %fus\n", elapsed, 1000 * elapsed / (width));
    // Release the first timer before creating the next one; otherwise the
    // handle is overwritten and the timer is never freed.
    cutDeleteTimer(hTimer);

    // Run and time: device -> host
    cutCreateTimer(&hTimer);
    cutStartTimer(hTimer);
    for (long i = 0; i < width; i++)
        CUDA_SAFE_CALL(cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost));
    cutStopTimer(hTimer);
    elapsed = cutGetTimerValue(hTimer);
    printf("Latency test (device -> host): %f = %fus\n", elapsed, 1000 * elapsed / (width));
    cutDeleteTimer(hTimer);

    CUDA_SAFE_CALL(cudaFree(d_data));
}
// Entry point: allocates a DATA_WIDTH x DATA_HEIGHT host buffer of FORMAT,
// fills it with VALUE, then runs the bandwidth and latency tests on it.
//
// argv is declared as char** (the standard main signature); it is forwarded
// to CUT_EXIT together with argc.
int main(int argc, char** argv)
{
    // Declare and allocate
    const size_t num_elems = (size_t) DATA_WIDTH * DATA_HEIGHT;
    FORMAT* h_vector = new FORMAT[num_elems];

    // Init: the bandwidth test's final check expects every element to be VALUE
    for (size_t i = 0; i < num_elems; i++)
        h_vector[i] = VALUE;

    // Run tests
    runTest_bandwidth(h_vector, DATA_WIDTH, DATA_HEIGHT);
    runTest_latency(h_vector, DATA_WIDTH);

    // Memory obtained with new[] must be released with delete[]
    delete[] h_vector;

    CUT_EXIT(argc, argv);
}