atomicAdd() showing error: no instance of overloaded function "atomicAdd" matches the argument list

So, I am trying to run a matrix-vector multiplication with CUDA in shared memory, and at first I was getting this error (see Code 1):

“no instance of overloaded function "atomicAdd" matches the argument list”

Then I read that atomicAdd() on doubles is not supported for compute capability below 6.0, and that we can implement it ourselves (CUDA C++ Programming Guide), so I added some code (see Code 2). But now I am getting another error: “function "atomicAdd(double*, double)" has already been defined”.

All I want is to run my code so I can profile it with nvprof. Can you help me figure out what is wrong? I am working with CUDA 12.0 on an NVIDIA GeForce 40MX with compute capability 5.0.


Code 1:
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include "cuda_runtime.h"
#include <iostream>

using namespace std;

#define TILE_SIZE 16
#define BLOCK_SIZE 256

__global__ void matvec(double* A, double* B, double* C, int n)
{
__shared__ double s_A[TILE_SIZE][TILE_SIZE];
__shared__ double s_B[TILE_SIZE];

int bx = blockIdx.x;
int tx = threadIdx.x;
int i = bx * blockDim.x + tx;

if (i < n) {
    s_B[tx] = B[i];
    for (int j = 0; j < n; j += TILE_SIZE) {
        s_A[tx][j + threadIdx.y] = A[(i * n) + j + threadIdx.y];
    }
}
__syncthreads();

if (i < n) {
    double tmp = 0.0;
    for (int j = 0; j < n; j += TILE_SIZE) {
        tmp += s_A[threadIdx.x][j + threadIdx.y] * s_B[j + threadIdx.y];
    }
    atomicAdd(&C[i], tmp);
}

}

int main()
{
int n = 5000;
double* A, * B, * C;
double* d_A, * d_B, * d_C;

A = (double*)malloc(n * n * sizeof(double));
B = (double*)malloc(n * sizeof(double));
C = (double*)malloc(n * sizeof(double));

for (int i = 0; i < n * n; i++) {
    A[i] = rand() / (double)RAND_MAX;
}
for (int i = 0; i < n; i++) {
    B[i] = rand() / (double)RAND_MAX;
    C[i] = 0.0;
}

cudaMalloc((void**)&d_A, n * n * sizeof(double));
cudaMalloc((void**)&d_B, n * sizeof(double));
cudaMalloc((void**)&d_C, n * sizeof(double));

cudaMemcpy(d_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, n * sizeof(double), cudaMemcpyHostToDevice);

dim3 dimBlock(TILE_SIZE, BLOCK_SIZE / TILE_SIZE, 1);
dim3 dimGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);

matvec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);

cudaMemcpy(C, d_C, n * sizeof(double), cudaMemcpyDeviceToHost);

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);

free(A);
free(B);
free(C);

return 0;

}

Code 2:
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include "cuda_runtime.h"
#include <iostream>

#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;

do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
        __double_as_longlong(val +
            __longlong_as_double(assumed)));

    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);

return __longlong_as_double(old);

}
#endif

using namespace std;

#define TILE_SIZE 16
#define BLOCK_SIZE 256

__global__ void matvec(double* A, double* B, double* C, int n)
{
__shared__ double s_A[TILE_SIZE][TILE_SIZE];
__shared__ double s_B[TILE_SIZE];

int bx = blockIdx.x;
int tx = threadIdx.x;
int i = bx * blockDim.x + tx;

if (i < n) {
    s_B[tx] = B[i];
    for (int j = 0; j < n; j += TILE_SIZE) {
        s_A[tx][j + threadIdx.y] = A[(i * n) + j + threadIdx.y];
    }
}
__syncthreads();

if (i < n) {
    double tmp = 0.0;
    for (int j = 0; j < n; j += TILE_SIZE) {
        tmp += s_A[threadIdx.x][j + threadIdx.y] * s_B[j + threadIdx.y];
    }
    atomicAdd(&C[i], tmp);
}

}

int main()
{
int n = 5000;
double* A, * B, * C;
double* d_A, * d_B, * d_C;

A = (double*)malloc(n * n * sizeof(double));
B = (double*)malloc(n * sizeof(double));
C = (double*)malloc(n * sizeof(double));

for (int i = 0; i < n * n; i++) {
    A[i] = rand() / (double)RAND_MAX;
}
for (int i = 0; i < n; i++) {
    B[i] = rand() / (double)RAND_MAX;
    C[i] = 0.0;
}

cudaMalloc((void**)&d_A, n * n * sizeof(double));
cudaMalloc((void**)&d_B, n * sizeof(double));
cudaMalloc((void**)&d_C, n * sizeof(double));

cudaMemcpy(d_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, n * sizeof(double), cudaMemcpyHostToDevice);

dim3 dimBlock(TILE_SIZE, BLOCK_SIZE / TILE_SIZE, 1);
dim3 dimGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);

matvec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);

cudaMemcpy(C, d_C, n * sizeof(double), cudaMemcpyDeviceToHost);

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);

free(A);
free(B);
free(C);

return 0;

}

When using this forum, please format your code correctly. To do that, one approach is to edit your post by clicking the pencil icon below it, then select your code, then click the </> button in the toolbar at the top of the edit window, then save your changes.

Please do that now.

A simple way to fix the “has already been defined” problem with your atomicAdd for double is to rename it to something like myAtomicAdd(...), both at the point of definition and at every point of use.
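For example, a minimal sketch of that rename based on your Code 2 (the name myAtomicAdd is just a placeholder, pick whatever you like):

__device__ double myAtomicAdd(double* address, double val)
{
    // Reinterpret the double as a 64-bit integer so atomicCAS can be used.
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
        // Uses integer comparison to avoid hanging on NaN (since NaN != NaN).
    } while (assumed != old);

    return __longlong_as_double(old);
}

and in the kernel:

    myAtomicAdd(&C[i], tmp);   // was: atomicAdd(&C[i], tmp);

With the rename, the #if __CUDA_ARCH__ < 600 guard is no longer needed, because myAtomicAdd cannot collide with the built-in atomicAdd.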