I am trying to run a matrix-vector multiplication in CUDA using shared memory, and at first I was getting this error (see Code 1):
"no instance of overloaded function "atomicAdd" matches the argument list"
Then I read that atomicAdd on double is not natively supported on compute capability 5.0 (it requires 6.0 or higher), and that we can implement it ourselves (see the CUDA C++ Programming Guide), so I added some code (see Code 2). But now I'm getting another error: "function "atomicAdd(double *, double)" has already been defined".
All I want is to run my code so I can profile it with nvprof. Can you help me figure out what is wrong? I am working with CUDA 12.0 on an NVIDIA GeForce 940MX, compute capability 5.0.
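For reference, this is roughly how I build and profile (the file name is a placeholder; sm_50 matches my GPU's compute capability 5.0):
nvcc -arch=sm_50 matvec.cu -o matvec
nvprof ./matvec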
Code 1:
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
using namespace std;
#define TILE_SIZE 16
#define BLOCK_SIZE 256
__global__ void matvec(double* A, double* B, double* C, int n)
{
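// shared-memory staging: a TILE_SIZE x TILE_SIZE tile of A and a TILE_SIZE slice of B per block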
__shared__ double s_A[TILE_SIZE][TILE_SIZE];
__shared__ double s_B[TILE_SIZE];
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = bx * blockDim.x + tx;
if (i < n) {
s_B[tx] = B[i];
for (int j = 0; j < n; j += TILE_SIZE) {
s_A[tx][j + threadIdx.y] = A[(i * n) + j + threadIdx.y];
}
}
__syncthreads();
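// each thread accumulates its row's partial dot product from the staged tiles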
if (i < n) {
double tmp = 0.0;
for (int j = 0; j < n; j += TILE_SIZE) {
tmp += s_A[threadIdx.x][j + threadIdx.y] * s_B[j + threadIdx.y];
}
atomicAdd(&C[i], tmp);
}
}
int main()
{
int n = 5000;
double* A, * B, * C;
double* d_A, * d_B, * d_C;
A = (double*)malloc(n * n * sizeof(double));
B = (double*)malloc(n * sizeof(double));
C = (double*)malloc(n * sizeof(double));
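// initialize A and B with random values in [0, 1]; the result vector C starts at zero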
for (int i = 0; i < n * n; i++) {
A[i] = rand() / (double)RAND_MAX;
}
for (int i = 0; i < n; i++) {
B[i] = rand() / (double)RAND_MAX;
C[i] = 0.0;
}
cudaMalloc((void**)&d_A, n * n * sizeof(double));
cudaMalloc((void**)&d_B, n * sizeof(double));
cudaMalloc((void**)&d_C, n * sizeof(double));
cudaMemcpy(d_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, n * sizeof(double), cudaMemcpyHostToDevice);
dim3 dimBlock(TILE_SIZE, BLOCK_SIZE / TILE_SIZE, 1);
dim3 dimGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);
matvec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
cudaMemcpy(C, d_C, n * sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(A);
free(B);
free(C);
return 0;
}
Code 2:
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
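// double-precision atomicAdd fallback for GPUs below compute capability 6.0, from the CUDA C++ Programming Guide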
#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
using namespace std;
#define TILE_SIZE 16
#define BLOCK_SIZE 256
__global__ void matvec(double* A, double* B, double* C, int n)
{
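// shared-memory staging: a TILE_SIZE x TILE_SIZE tile of A and a TILE_SIZE slice of B per block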
__shared__ double s_A[TILE_SIZE][TILE_SIZE];
__shared__ double s_B[TILE_SIZE];
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = bx * blockDim.x + tx;
if (i < n) {
s_B[tx] = B[i];
for (int j = 0; j < n; j += TILE_SIZE) {
s_A[tx][j + threadIdx.y] = A[(i * n) + j + threadIdx.y];
}
}
__syncthreads();
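// each thread accumulates its row's partial dot product from the staged tiles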
if (i < n) {
double tmp = 0.0;
for (int j = 0; j < n; j += TILE_SIZE) {
tmp += s_A[threadIdx.x][j + threadIdx.y] * s_B[j + threadIdx.y];
}
atomicAdd(&C[i], tmp);
}
}
int main()
{
int n = 5000;
double* A, * B, * C;
double* d_A, * d_B, * d_C;
A = (double*)malloc(n * n * sizeof(double));
B = (double*)malloc(n * sizeof(double));
C = (double*)malloc(n * sizeof(double));
for (int i = 0; i < n * n; i++) {
A[i] = rand() / (double)RAND_MAX;
}
for (int i = 0; i < n; i++) {
B[i] = rand() / (double)RAND_MAX;
C[i] = 0.0;
}
cudaMalloc((void**)&d_A, n * n * sizeof(double));
cudaMalloc((void**)&d_B, n * sizeof(double));
cudaMalloc((void**)&d_C, n * sizeof(double));
cudaMemcpy(d_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, n * sizeof(double), cudaMemcpyHostToDevice);
dim3 dimBlock(TILE_SIZE, BLOCK_SIZE / TILE_SIZE, 1);
dim3 dimGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1);
matvec<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
cudaMemcpy(C, d_C, n * sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(A);
free(B);
free(C);
return 0;
}