I am trying to run a matrix-vector multiplication in CUDA using shared memory, and at first I was getting this error (see Code 1):
"no instance of overloaded function "atomicAdd" matches the argument list"
Then I read that atomicAdd on double is not natively supported on compute capability 5.0 (it requires 6.0 or higher), and that we can implement it ourselves (see the CUDA C++ Programming Guide), so I added some code (see Code 2). But now I'm getting another error: "function "atomicAdd(double *, double)" has already been defined".
All I want is to run my code so I can profile it with nvprof. Can you help me figure out what is wrong? I am working with CUDA 12.0 on an NVIDIA GeForce 940MX, compute capability 5.0.
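For reference, this is roughly how I build and profile (the file name is a placeholder; sm_50 matches my GPU's compute capability 5.0):
nvcc -arch=sm_50 matvec.cu -o matvec
nvprof ./matvec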
Code 1:
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
using namespace std;
#define TILE_SIZE 16
#define BLOCK_SIZE 256
__global__ void matvec(double* A, double* B, double* C, int n)
{
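// shared-memory staging: a TILE_SIZE x TILE_SIZE tile of A and a TILE_SIZE slice of B per block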
__shared__ double s_A[TILE_SIZE][TILE_SIZE];
__shared__ double s_B[TILE_SIZE];
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = bx * blockDim.x + tx;
if (i < n) {
s_B[tx] = B[i];
for (int j = 0; j < n; j += TILE_SIZE) {
s_A[tx][j + threadIdx.y] = A[(i * n) + j + threadIdx.y];
}
}
__syncthreads();
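// each thread accumulates its row's partial dot product from the staged tiles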
if (i < n) {
double tmp = 0.0;
for (int j = 0; j < n; j += TILE_SIZE) {
tmp += s_A[threadIdx.x][j + threadIdx.y] * s_B[j + threadIdx.y];
}
atomicAdd(&C[i], tmp);
}
}
int main()
{
int n = 5000;
double* A, * B, * C;
double* d_A, * d_B, * d_C;
A = (double*)malloc(n * n * sizeof(double));
B = (double*)malloc(n * sizeof(double));
C = (double*)malloc(n * sizeof(double));
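// initialize A and B with random values in [0, 1]; the result vector C starts at zero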
for (int i = 0; i < n * n; i++) {
A[i] = rand() / (double)RAND_MAX;
}
for (int i = 0; i < n; i++) {
B[i] = rand() / (double)RAND_MAX;
C[i] = 0.0;
}
cudaMalloc((void**)&d_A, n * n * sizeof(double));
cudaMalloc((void**)&d_B, n * sizeof(double));
cudaMalloc((void**)&d_C, n * sizeof(double));
cudaMemcpy(d_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, n * sizeof(double), cudaMemcpyHostToDevice);
dim3 dimBlock(TILE_SIZE, BLOCK_SIZE / TILE_SIZE, 1);
dim3 dimGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);
matvec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
cudaMemcpy(C, d_C, n * sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(A);
free(B);
free(C);
return 0;
}
Code 2:
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
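// double-precision atomicAdd fallback for GPUs below compute capability 6.0, from the CUDA C++ Programming Guide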
#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
using namespace std;
#define TILE_SIZE 16
#define BLOCK_SIZE 256
__global__ void matvec(double* A, double* B, double* C, int n)
{
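// shared-memory staging: a TILE_SIZE x TILE_SIZE tile of A and a TILE_SIZE slice of B per block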
__shared__ double s_A[TILE_SIZE][TILE_SIZE];
__shared__ double s_B[TILE_SIZE];
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = bx * blockDim.x + tx;
if (i < n) {
s_B[tx] = B[i];
for (int j = 0; j < n; j += TILE_SIZE) {
s_A[tx][j + threadIdx.y] = A[(i * n) + j + threadIdx.y];
}
}
__syncthreads();
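// each thread accumulates its row's partial dot product from the staged tiles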
if (i < n) {
double tmp = 0.0;
for (int j = 0; j < n; j += TILE_SIZE) {
tmp += s_A[threadIdx.x][j + threadIdx.y] * s_B[j + threadIdx.y];
}
atomicAdd(&C[i], tmp);
}
}
int main()
{
int n = 5000;
double* A, * B, * C;
double* d_A, * d_B, * d_C;
A = (double*)malloc(n * n * sizeof(double));
B = (double*)malloc(n * sizeof(double));
C = (double*)malloc(n * sizeof(double));
for (int i = 0; i < n * n; i++) {
A[i] = rand() / (double)RAND_MAX;
}
for (int i = 0; i < n; i++) {
B[i] = rand() / (double)RAND_MAX;
C[i] = 0.0;
}
cudaMalloc((void**)&d_A, n * n * sizeof(double));
cudaMalloc((void**)&d_B, n * sizeof(double));
cudaMalloc((void**)&d_C, n * sizeof(double));
cudaMemcpy(d_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, n * sizeof(double), cudaMemcpyHostToDevice);
dim3 dimBlock(TILE_SIZE, BLOCK_SIZE / TILE_SIZE, 1);
dim3 dimGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);
matvec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
cudaMemcpy(C, d_C, n * sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(A);
free(B);
free(C);
return 0;
}