cudaMemcpy Failing To Copy Variable From Device To Host Correctly

Sorry for what may be a repetitive question, but this has me stumped.

I’m attempting to write a simple example program so that I can get a grasp of some of the CUDA tools. The kernel literally just adds two doubles together, but on the GPU rather than the CPU. The code for this file is below:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>

//Test CUDA function.

__global__ void add(double* out, double* a, double* b)
{
    *out = *a + *b;
}

int helper(double* OUT, double* A, double* B);

int main()
{
    
    double* a = new double(1);
    double* b = new double(4);
    double* out2 = new double(0);

    helper(out2, a, b);
}

int helper(double *OUT, double* A, double* B)
{
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);

    double* dev_out;
    double* dev_a;
    double* dev_b;

    //Allocating
    cudaMalloc((void**)&dev_out, sizeof(double));
    cudaMalloc((void**)&dev_a, sizeof(double));
    cudaMalloc((void**)&dev_b, sizeof(double));

    //Copying
    cudaMemcpy(dev_a, A, sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, B, sizeof(double), cudaMemcpyHostToDevice);

    //Synchronization
    cudaDeviceSynchronize();

    //Get the Output
    cudaStatus = cudaMemcpy(OUT, dev_out, sizeof(double), cudaMemcpyDefault);
    if (cudaStatus != cudaSuccess)
    {
        std::cout << "NOT SUCCESSFUL" << std::endl;
    }
    
    //Printing
    std::cout << *OUT << std::endl;

    //Freeing Memory
    cudaFree(dev_out);
    cudaFree(dev_a);
    cudaFree(dev_b);
    
    return 0;
}

This program compiles and runs fine, and in fact if I put a printf() in the kernel I can actually see that the two numbers are added properly. However, when the data is copied back to the host, the number that I receive is always 0, even though the cudaMemcpy reports success.

However, if I run the CUDA example template that ships with Visual Studio 2019:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    
    return cudaStatus;
}

This runs completely fine, even though the only difference I can see is that it adds vectors rather than scalars.

Does CUDA somehow require vectors rather than scalars, or am I missing something obvious?

In your first code, there is no call to the add kernel anywhere in the code you have posted. If I add an appropriate kernel call to your code, I get an output of 5 (as expected) rather than zero. I’m skeptical of the claim that a printf() in the kernel shows the two numbers being added properly; I don’t see how that is possible, with no call to the add routine.
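
As for why the cudaMemcpy itself reports success: copying from a device buffer that was allocated but never written is still a perfectly valid copy, so it returns cudaSuccess; you simply get back whatever bytes happen to be sitting in the allocation. A minimal standalone sketch (added here for illustration, not from the original thread):

#include "cuda_runtime.h"
#include <stdio.h>

int main()
{
    double* dev_p = nullptr;
    double host_val = -1.0;

    //Allocated on the device but never written by any kernel.
    cudaMalloc((void**)&dev_p, sizeof(double));

    //The copy is legal, so it returns cudaSuccess; the value copied back
    //is simply whatever bytes were already in the allocation.
    cudaError_t status = cudaMemcpy(&host_val, dev_p, sizeof(double),
                                    cudaMemcpyDeviceToHost);

    printf("status = %s, value = %f\n", cudaGetErrorString(status), host_val);

    cudaFree(dev_p);
    return 0;
}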

A kernel call looks something like this:

 addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

Your second code has one. Your first code doesn’t.

If you add something like this to your first code before the cudaDeviceSynchronize() call, I would expect good results:

add<<<1,1>>>(dev_out, dev_a, dev_b);
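
For reference, here is roughly what the helper looks like with the launch restored and a couple of basic error checks added. This is a sketch based on the code above, not a polished implementation; it assumes the original file's includes and the add kernel as posted:

int helper(double* OUT, double* A, double* B)
{
    cudaError_t cudaStatus = cudaSetDevice(0);

    double* dev_out;
    double* dev_a;
    double* dev_b;

    //Allocating
    cudaMalloc((void**)&dev_out, sizeof(double));
    cudaMalloc((void**)&dev_a, sizeof(double));
    cudaMalloc((void**)&dev_b, sizeof(double));

    //Copying
    cudaMemcpy(dev_a, A, sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, B, sizeof(double), cudaMemcpyHostToDevice);

    //The previously missing launch: one block of one thread is enough
    //for a scalar add.
    add<<<1, 1>>>(dev_out, dev_a, dev_b);

    //Catch launch-configuration errors before waiting on the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        std::cout << "launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    //Wait for the kernel to finish.
    cudaDeviceSynchronize();

    //Get the Output. cudaMemcpyDeviceToHost is explicit here; the
    //original's cudaMemcpyDefault also works on systems with unified
    //virtual addressing.
    cudaStatus = cudaMemcpy(OUT, dev_out, sizeof(double), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        std::cout << "NOT SUCCESSFUL" << std::endl;
    }

    //Printing (should now show 5 for the inputs in main())
    std::cout << *OUT << std::endl;

    //Freeing Memory
    cudaFree(dev_out);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return 0;
}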

Originally this was just a copying mistake where I forgot to include this line, and I can only be certain of that because of the version control I had. After adding the line back in (or reverting to the prior commit), it now works fine… I’m honestly just going to take it. Would it be correct to delete this thread, since I doubt anybody else will get much information from it, or should I just leave it up?

Do as you wish. I don’t try to suggest or impose any rules here, except in a few egregious cases, and this is not anything problematic in my view.