How to copy integer from Device to Host?

Hello,
I am kind of new to CUDA programming.

I was trying to do a summation of all elements in an array.

I declared device volatile int sum = 0;

And I need to copy it from device to host using cudaMemcpyFromSymbol.

Parts of my code for context:

__device__ volatile int sum = 0;
__global__ void sum1dKernel(int *a) {
	int i = threadIdx.x;
	sum += a[i];
}
cudaError_t cudaMean1d(int* a, const int size){
int *dev_a = 0;
int dev_c;
status = cudaMalloc((void**)&dev_a, size * sizeof(int));

if (status != cudaSuccess) {
	fprintf(stderr, "Failed to allocate memory\n");
	goto Error;
}

status = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (status != cudaSuccess) {
	fprintf(stderr, "Failed to copy from host to GPU buffer\n");
	goto Error;
}
sum1dKernel << <1, size >> > (dev_a);
status = cudaMemcpyFromSymbol(&dev_c, sum, sizeof(int));
if (status != cudaSuccess) {
	fprintf(stderr, "Failed to copy from GPU buffer to host\n");
	goto Error;
}
}

For this case I was referring to:

My issue is in this part:

status = cudaMemcpyFromSymbol(&dev_c, sum, sizeof(int));

It is labelled as error and would not compile because I was casting volatile int to const void* type.
That is fair enoughm but how to I fix it then?

I need to return an integer value which would bge supposed to be sum of all elements in array.

Or would it work if I replace device with host? Would it work with GPU kernel and return correct value?

I don’t have any trouble compiling your code after fixing a few things you omitted:

# cat t32.cu
#include <cstdio>

__device__ volatile int sum = 0;
__global__ void sum1dKernel(int *a) {
        int i = threadIdx.x;
        sum += a[i];
}
cudaError_t cudaMean1d(int* a, const int size){
int *dev_a = 0;
int dev_c;
cudaError_t status = cudaMalloc((void**)&dev_a, size * sizeof(int));

if (status != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory\n");
        goto Error;
}

status = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (status != cudaSuccess) {
        fprintf(stderr, "Failed to copy from host to GPU buffer\n");
        goto Error;
}
sum1dKernel << <1, size >> > (dev_a);
status = cudaMemcpyFromSymbol(&dev_c, sum, sizeof(int));
if (status != cudaSuccess) {
        fprintf(stderr, "Failed to copy from GPU buffer to host\n");
        goto Error;
}
Error:
return cudaSuccess;
}
# nvcc -c  t32.cu
#

If you’re not using CUDA 12.2 or newer, try updating your CUDA version.