Hello, I’m new to cuda, also new to C++, i’m having a weird issue with this sample code (from Nvidia Website : An Even Easier Introduction to CUDA | NVIDIA Technical Blog), the GPU code is 3x slower than the CPU one, here is the code

CPU CODE :

```
#include <iostream>
#include <math.h>
using namespace std;
// function to add the elements of two arrays
void add(int n, float *x, float *y)
{
for (int i = 0; i < n; i++)
y[i] = x[i] + y[i];
}
int main(void)
{
unsigned int N = 1<<25; // 1M elements
std::cout << "Iterations : " << N << "\n";
float *x = new float[N];
float *y = new float[N];
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
// Run kernel on 1M elements on the CPU
add(N, x, y);
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
std::cout << "Max error: " << maxError << std::endl;
// Free memory
delete [] x;
delete [] y;
return 0;
}
```

GPU CODE :

```
#include <iostream>
#include <math.h>
// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
int index = threadIdx.x;
int stride = blockDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
int main(void)
{
int N = 1<<25;
float *x, *y;
// Allocate Unified Memory – accessible from CPU or GPU
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
// Run kernel on 1M elements on the GPU
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, x, y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
std::cout << "Max error: " << maxError << std::endl;
// Free memory
cudaFree(x);
cudaFree(y);
return 0;
}
```

GPU : RTX4800

compile commands :

GPU code :

```
nvcc .\test2.cu -o test2 -arch=sm_90a
```

CPU code :

```
nvcc .\test.cu -o test
```

CPU RESULT :

```
PS C:\Users\llefe\CUDA> Measure-command {.\test.exe}
Days : 0
Hours : 0
Minutes : 0
Seconds : 0
Milliseconds : 892
Ticks : 8920678
TotalDays : 1,03248587962963E-05
TotalHours : 0,000247796611111111
TotalMinutes : 0,0148677966666667
TotalSeconds : 0,8920678
TotalMilliseconds : 892,0678
```

GPU RESULT :

```
PS C:\Users\llefe\CUDA> Measure-command {.\test2.exe}
Days : 0
Hours : 0
Minutes : 0
Seconds : 2
Milliseconds : 882
Ticks : 28823809
TotalDays : 3,33608900462963E-05
TotalHours : 0,000800661361111111
TotalMinutes : 0,0480396816666667
TotalSeconds : 2,8823809
TotalMilliseconds : 2882,3809
```