Hello,
I am kind of new to CUDA programming.
I was trying to do a summation of all elements in an array.
I declared device volatile int sum = 0;
And I need to copy it from device to host using cudaMemcpyFromSymbol.
Parts of my code for context:
__device__ volatile int sum = 0;
__global__ void sum1dKernel(int *a) {
int i = threadIdx.x;
sum += a[i];
}
cudaError_t cudaMean1d(int* a, const int size){
int *dev_a = 0;
int dev_c;
status = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (status != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory\n");
goto Error;
}
status = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (status != cudaSuccess) {
fprintf(stderr, "Failed to copy from host to GPU buffer\n");
goto Error;
}
sum1dKernel << <1, size >> > (dev_a);
status = cudaMemcpyFromSymbol(&dev_c, sum, sizeof(int));
if (status != cudaSuccess) {
fprintf(stderr, "Failed to copy from GPU buffer to host\n");
goto Error;
}
}
For this case I was referring to:
My issue is in this part:
status = cudaMemcpyFromSymbol(&dev_c, sum, sizeof(int));
It is labelled as error and would not compile because I was casting volatile int to const void* type.
That is fair enoughm but how to I fix it then?
I need to return an integer value which would bge supposed to be sum of all elements in array.
Or would it work if I replace device with host? Would it work with GPU kernel and return correct value?