Help! Why is the GPU acceleration ratio so small compared to the CPU in Jetson Orin Nano 8G?

In the simple example program below, the CPU time is 42 ms and the GPU time is 17 ms, which is only about 2.4 times faster. The same code takes 38 ms on the CPU and 5 ms on the GPU of my laptop (CPU: Ryzen 7 5800H, GPU: RTX 2060), which is almost 8 times faster. Why is that?

nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Sun_Oct_23_22:16:07_PDT_2022
Cuda compilation tools, release 11.4, V11.4.315
Build cuda_11.4.r11.4/compiler.31964100_0


#include <omp.h>

#include <chrono>
#include <cmath>    // fabs
#include <cstdio>   // printf
#include <iostream>
#include <vector>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define TICK(x) auto bench_##x = std::chrono::steady_clock::now()
#define TOCK(x)                                                                                                      \
    auto x = std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - bench_##x) \
                 .count();                                                                                           \
    std::cout << #x ": " << x << "s" << std::endl

template <typename T>
__global__ void vector_add_gpu(T *a, T *b, T *c, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x; 
    const int t_n = gridDim.x * blockDim.x;         
    while (tid < n) {  // grid-stride loop: each thread handles every t_n-th element
        c[tid] = a[tid] * b[tid] + c[tid];
        tid += t_n;
    }
}

template <typename T>
void TestVectorAddGPU() {
    int n = 1 << 25;
    T *a = new T[n];
    T *b = new T[n];
    T *c1 = new T[n];
    for (int i = 0; i < n; ++i) {
        a[i] = (double)i / n;
        b[i] = 1 + (double)i / n;
        c1[i] = 2 + (double)i / n;
    }

    T *dev_a, *dev_b, *dev_c;
    cudaMalloc(&dev_a, sizeof(T) * n);
    cudaMemcpy(dev_a, a, sizeof(T) * n, cudaMemcpyHostToDevice);

    cudaMalloc(&dev_b, sizeof(T) * n);
    cudaMemcpy(dev_b, b, sizeof(T) * n, cudaMemcpyHostToDevice);

    cudaMalloc(&dev_c, sizeof(T) * n);
    cudaMemcpy(dev_c, c1, sizeof(T) * n, cudaMemcpyHostToDevice);

    // Time for CPU.
    TICK(cpu);
#pragma omp parallel for
    for (int i = 0; i < n; ++i)
        c1[i] = a[i] * b[i] + c1[i];
    TOCK(cpu);

    // Time for GPU.
    TICK(gpu);
    vector_add_gpu<<<512, 128>>>(dev_a, dev_b, dev_c, n);
    cudaDeviceSynchronize();  // kernel launches are asynchronous; wait before stopping the timer
    TOCK(gpu);

    // Check for correctness
    T *c2 = new T[n];
    cudaMemcpy(c2, dev_c, sizeof(T) * n, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i)
        if (fabs(c1[i] - c2[i]) > 1e-6) {
            std::cerr << "error!";
            printf("i: %d, c1: %f c2: %f\n", i, c1[i], c2[i]);
        }

    cudaFree(dev_a);  // free device memory (missing in the original)
    cudaFree(dev_b);
    cudaFree(dev_c);
    delete[] a;
    delete[] b;
    delete[] c1;
    delete[] c2;
}

int main() {
    for (int i = 0; i < 10; ++i) {
        TestVectorAddGPU<double>();  // element type is not shown in the original; double matches the (double) casts above
    }
    return 0;
}

I can’t answer the question, but your code would be more readable with the forum’s “code” formatting. Click the “pencil” icon in the lower right of your existing post to start editing, highlight the code, and click the “</>” icon in the middle of the toolbar; alternatively, while editing, place three backticks (the unshifted key with the tilde ‘~’) on their own lines above and below the code.


Thanks for your advice. I have changed the code format.


Have you maximized the device performance before the measurement?

$ sudo nvpmodel -m 0
$ sudo jetson_clocks


When I type “sudo nvpmodel -q”, it shows:

NV Power Mode: 10W


Did you run the jetson_clocks script to lock the clocks at their maximum?
If not, the default dynamic frequency scaling means the clocks may run lower when the device is not fully loaded.


This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.