GPUassert: unknown error on cudaMalloc

Hello everyone,

I am new to CUDA programming and still learning. For practice, I was trying out summing two arrays with CUDA. The following is the code:

#include <stdio.h>

#define N 10

// Wraps a CUDA runtime call: forwards its cudaError_t result to gpuAssert
// together with the file and line of the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Reports a failed CUDA runtime call.
//   code  - status returned by the wrapped call
//   file  - source file of the call site (from __FILE__)
//   line  - source line of the call site (from __LINE__)
//   abort - when true (the default) the process exits with `code` as status
// Does nothing when `code` is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        // Must stay inside the failure branch: exiting unconditionally would
        // terminate the program after the first successful call.
        if (abort) exit(code);
    }
}

// Element-wise sum: c[i] = a[i] + b[i] for i < N.
// The element index is taken from blockIdx.x only, so each block handles
// one element; the host launches it as add<<<N,1>>>.
// a, b - input arrays, c - output array (passed as device pointers by main).
__global__ void add(const int *a, const int *b, int *c) {
    int tid;
    tid = blockIdx.x;
    printf("tid = %d\n", tid);   // debugging aid: shows which block ran
    // Guard against blocks whose index falls outside the arrays.
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

// Host driver: allocates three N-element device arrays, fills a and b on the
// host, copies them to the device, launches add, copies c back and prints
// every sum.
int main() {

    int a [N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // Device buffers for the two inputs and the output.
    gpuErrchk(cudaMalloc( (void**)&dev_a, N * sizeof(int) ));
    gpuErrchk(cudaMalloc( (void**)&dev_b, N * sizeof(int) ));
    gpuErrchk(cudaMalloc( (void**)&dev_c, N * sizeof(int) ));

    // Fill the host inputs: a[i] = -i, b[i] = i*i.
    for (int i = 0; i < N; i++) {
        printf("i = %d\n", i);
        a[i] = -i;
        printf("a = %d\n", a[i]);
        b[i] = i * i;
        printf("b = %d\n", b[i]);
    }

    gpuErrchk(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice));

    // N blocks of one thread each; the kernel indexes by blockIdx.x.
    // NOTE(review): the launch itself is not wrapped in gpuErrchk, so a
    // launch failure is not reported at this point.
    add<<<N,1>>>(dev_a, dev_b, dev_c);

    gpuErrchk(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    return 0;
}


  1. The output is: GPUassert: unknown error on line 28 (i.e. gpuErrchk(cudaMalloc( (void**)&dev_a, N * sizeof(int) ))). What would be the problem and how do I solve it?

  2. When I run the code without the gpu error check macro, I get garbage output from the add kernel and the printf inside the kernel does not give any output.

  3. Also, when I add the headers #include "cuda_runtime.h" and #include "device_launch_parameters.h", the second header turns gray (using CLion), which means CLion thinks I am not using that header in my code. But on line 17, I am using blockIdx.x.
    Does this mean I do not have to include both header files to run the program? If I do have to include both, why does the second one turn gray? (CLion also highlights the blockIdx variable in yellow.)

I am using CLion on Linux with Quadro K620 and CUDA version 11.2

I apologize if the questions seem trivial, I am new to CUDA and trying to learn.

Thank you for your help.

Update: After restarting the system, the GPUassert error is gone but the output vector is all zeros (which shouldn’t be the case) and the print inside the kernel still does not produce any output.

  1. Quadro K620 is compute capability 5.0. (cc5.0)
  2. Your error checking around the kernel launch is incorrect.
  3. When asking for help, it's good to provide your compile command line as well.

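Study proper CUDA error checking and modify your code accordingly; then you will get a descriptive error. The problem is that you are not compiling the code correctly for your GPU: CUDA 11.2 compiles for a cc5.2 device by default, and code built for cc5.2 will not run on a cc5.0 device such as the Quadro K620. Specify the target architecture explicitly when compiling, for example by passing -arch=sm_50 to nvcc.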