I just started with CUDA development and wrote a simple app to test performance.
I tried to compare my two GPUs to see the difference and noticed that the GTX 980 Ti seems to outperform the RTX 2080 Ti, which is weird… Am I doing something wrong here, or what else could cause this behavior?
Here’s the code I use to test this:
#include <cstdio>
#include <chrono>
#include <cuda_runtime.h>

__global__ void device_add(long* a, long* b, long* c) {
    c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

void run_on_device(int idx, long* a, long* b, long* c) {
    cudaSetDevice(idx);
    auto start = std::chrono::high_resolution_clock::now();

    // Allocate device buffers and copy the inputs over
    long size = N * sizeof(long);
    long* da, * db, * dc;
    cudaMalloc(&da, size);
    cudaMalloc(&db, size);
    cudaMalloc(&dc, size);
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);

    // Launch the kernel 10 times on a single block of N threads
    for (int i = 0; i < 10; i++) {
        device_add<<<1, N>>>(da, db, dc);
    }

    // The device-to-host copy synchronizes before the timer stops
    cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);

    auto elapsed = std::chrono::high_resolution_clock::now() - start;
    long long microseconds = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
    printf("Device %d time: %lld\n", idx, microseconds);

    cudaFree(da); cudaFree(db); cudaFree(dc);
}

int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Device 0: %s\n", prop.name);
    cudaGetDeviceProperties(&prop, 1);
    printf("Device 1: %s\n", prop.name);

    long* a, * b, * c;
    a = new long[N]; fill_array(a);
    b = new long[N]; fill_array(b);
    c = new long[N];

    run_on_device(0, a, b, c);
    run_on_device(1, a, b, c);

    delete[] a; delete[] b; delete[] c;
    return 0;
}
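N and fill_array aren't shown above; they're just a constant and a helper that fills the arrays with test values, roughly like this (the exact numbers shouldn't matter, as long as N fits in a single block):

#define N 512  // kept <= 1024 since the kernel runs in a single block

void fill_array(long* data) {
    for (long i = 0; i < N; i++) {
        data[i] = i;  // arbitrary test values
    }
}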
When I run it, this is the output I get:
Device 0: GeForce RTX 2080 Ti
Device 1: GeForce GTX 980 Ti
Device 0 time: 104431
Device 1 time: 50073
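In case my host-side timing is the problem, I was also thinking of measuring only the kernel launches with CUDA events, something like this rough, untested sketch (reusing da, db, dc and idx from run_on_device):

// Time only the 10 kernel launches, excluding allocation and copies
cudaEvent_t begin, end;
cudaEventCreate(&begin);
cudaEventCreate(&end);

cudaEventRecord(begin);
for (int i = 0; i < 10; i++) {
    device_add<<<1, N>>>(da, db, dc);
}
cudaEventRecord(end);
cudaEventSynchronize(end);  // wait for the kernels to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, begin, end);  // elapsed time in milliseconds
printf("Device %d kernel time: %f ms\n", idx, ms);

cudaEventDestroy(begin);
cudaEventDestroy(end);

But I'm not sure whether that would change the picture.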