vorlket
1
Hi, I have the following program:
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void AddIntsCUDA(int *a, int *b)
{
a[0] = a[0] + b[0];
}
int main()
{
int a = 5;
int b = 9;
int *d_a;
int *d_b;
if(cudaMalloc(&d_a, sizeof(int)) == cudaSuccess)
{
if(cudaMalloc(&d_b, sizeof(int)) == cudaSuccess)
{
cudaMemcpy(d_a, &a, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &b, sizeof(int), cudaMemcpyHostToDevice);
AddIntsCUDA<<<1, 1>>>(d_a, d_b);
cudaMemcpy(&a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
cout << "The answer is " << a << endl;
cudaFree(d_a);
cudaFree(d_b);
}
}
return 0;
}
It supposed to give “The answer is 14” but it gives “The answer is 5”.
What’s going on?
njuffa
2
When I compiled and ran it, it printed The answer is 14
Add proper CUDA error checking and all will be revealed.
vorlket
4
Running
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void AddIntsCUDA(int *a, int *b)
{
a[0] = a[0] + b[0];
}
int main()
{
int a = 5;
int b = 9;
int *d_a;
int *d_b;
if(cudaMalloc(&d_a, sizeof(int)) == cudaSuccess)
{
if(cudaMalloc(&d_b, sizeof(int)) == cudaSuccess)
{
cudaMemcpy(d_a, &a, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &b, sizeof(int), cudaMemcpyHostToDevice);
AddIntsCUDA<<<1, 1>>>(d_a, d_b);
cudaError_t ret = cudaGetLastError();
cout << cudaGetErrorString(ret) << endl;
cudaMemcpy(&a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
cout << "The answer is " << a << endl;
cudaFree(d_a);
cudaFree(d_b);
}
}
return 0;
}
returns
no kernel image is available for execution on the device
The answer is 5
I’m using Tesla K20c with
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
njuffa
5
Tesla K20c has compute capability 3.5. What GPU target architecture did you specify when you compiled with nvcc
?
vorlket
7
Ran with:
nvcc -arch=compute_35 test.cu -o test
and it works. Thanks for help.