CUDA program giving wrong output

Hi, I have the following program:

#include <iostream>
#include <cuda.h>

using namespace std;

__global__ void AddIntsCUDA(int *a, int *b)
{
        a[0] = a[0] + b[0];
}

int main()
{
        int a = 5;
        int b = 9;
        int *d_a;
        int *d_b;

        if(cudaMalloc(&d_a, sizeof(int)) == cudaSuccess)
        {
                if(cudaMalloc(&d_b, sizeof(int)) == cudaSuccess)
                {
                        cudaMemcpy(d_a, &a, sizeof(int), cudaMemcpyHostToDevice);
                        cudaMemcpy(d_b, &b, sizeof(int), cudaMemcpyHostToDevice);

                        AddIntsCUDA<<<1, 1>>>(d_a, d_b);

                        cudaMemcpy(&a, d_a, sizeof(int), cudaMemcpyDeviceToHost);

                        cout << "The answer is " << a << endl;

                        cudaFree(d_a);
                        cudaFree(d_b);
                }
        }
        return 0;
}

It's supposed to print “The answer is 14” but it prints “The answer is 5”.

What’s going on?

When I compiled and ran it, it printed “The answer is 14”.

Add proper CUDA error checking and all will be revealed.
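
For example, a minimal sketch of a checking macro (CUDA_CHECK is just an illustrative name, not something from the toolkit):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                             \
        do {                                                         \
                cudaError_t err = (call);                            \
                if (err != cudaSuccess)                              \
                {                                                    \
                        fprintf(stderr, "CUDA error %s:%d: %s\n",    \
                                __FILE__, __LINE__,                  \
                                cudaGetErrorString(err));            \
                        exit(EXIT_FAILURE);                          \
                }                                                    \
        } while (0)

Wrap every runtime call (cudaMalloc, cudaMemcpy, cudaFree) in it, and after a kernel launch check both the launch itself and the execution:

        AddIntsCUDA<<<1, 1>>>(d_a, d_b);
        CUDA_CHECK(cudaGetLastError());        // errors from the launch
        CUDA_CHECK(cudaDeviceSynchronize());   // errors during kernel execution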

Running

#include <iostream>
#include <cuda.h>

using namespace std;

__global__ void AddIntsCUDA(int *a, int *b)
{
        a[0] = a[0] + b[0];
}

int main()
{
        int a = 5;
        int b = 9;
        int *d_a;
        int *d_b;

        if(cudaMalloc(&d_a, sizeof(int)) == cudaSuccess)
        {
                if(cudaMalloc(&d_b, sizeof(int)) == cudaSuccess)
                {
                        cudaMemcpy(d_a, &a, sizeof(int), cudaMemcpyHostToDevice);
                        cudaMemcpy(d_b, &b, sizeof(int), cudaMemcpyHostToDevice);

                        AddIntsCUDA<<<1, 1>>>(d_a, d_b);
                        cudaError_t ret = cudaGetLastError();
                        cout << cudaGetErrorString(ret) << endl;

                        cudaMemcpy(&a, d_a, sizeof(int), cudaMemcpyDeviceToHost);

                        cout << "The answer is " << a << endl;

                        cudaFree(d_a);
                        cudaFree(d_b);
                }
        }
        return 0;
}

returns
no kernel image is available for execution on the device
The answer is 5

I’m using Tesla K20c with

$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0

Tesla K20c has compute capability 3.5. What GPU target architecture did you specify when you compiled with nvcc?

I just ran

nvcc test.cu -o test

Ran with:

nvcc -arch=compute_35 test.cu -o test

and it works. Thanks for the help.
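
Glad it's sorted. For anyone who finds this later: when no architecture is given, nvcc in CUDA 11.x targets sm_52 by default, and a compute-capability 3.5 card like the K20c has no kernel image it can run in that binary, which is exactly what the error message says. As far as I know, -arch=compute_35 embeds PTX that the driver JIT-compiles for the device at run time; if you would rather embed machine code for the card directly (sm_35 is deprecated in CUDA 11.x but still supported), something like the following should also work:

nvcc -gencode arch=compute_35,code=sm_35 test.cu -o test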