Https://developer.nvidia.com/cuda-education

In

I found a CUDA C++ Basics ppt named
NVIDIA_Introduction_to_CUDA_C

The code/output/compilation are given below. This code runs but gives wrong answers
I have added printf & Unable figure out

nvcc *.cu -o par-threadid.out
./par-threadid.out
1804289383 + 1365180540 = -1125497373
846930886 + 1540383426 = -1907652984
1681692777 + 304089172 = 1985781949
1714636915 + 1303455736 = -1276874645
1957747793 + 35005211 = 1992753004
424238335 + 521595368 = 945833703
719885386 + 294702567 = 1014587953
1649760492 + 1726956429 = -918250375
596516649 + 336465782 = 932982431
1189641421 + 861021530 = 2050662951
1025202362 + 278722862 = 1303925224
1350490027 + 233665123 = 1584155150
783368690 + 2145174067 = -1366424539
1102520059 + 468703135 = 1571223194
2044897763 + 1101513929 = -1148555604
1967513926 + 1801979802 = -525473568
cat .cu
void random_ints(int
a, int N)
{
int i;
for (i = 0; i < N; ++i)
a[i] = rand();
}

global void add(int *a, int *b, int *c) {
c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}
define N 16
include <stdio.h>
include <cuda_runtime.h>
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);

    // Alloc space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Alloc space for host copies of a, b, c and setup input values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);
     // Copy inputs to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU with N threads
    add<<<1,N>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    for (int k = 0; k < N; k++)
            {
                    printf("%d + %d = %d\n", a[k], b[k],c[k]);
            }

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}