cudaMemcpy Failing To Copy Variable From Device To Host Correctly

Sorry for what may be a repetitive question, but this has me stumped.

I’m attempting to write a simple example program so that I can get a grasp of some of the CUDA tools. The kernel literally just adds two doubles together, but on the GPU rather than the CPU. The code for this file is below:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>

//Test CUDA function.

__global__ void add(double* out, double* a, double* b)
{
    *out = *a + *b;
}

int helper(double* OUT, double* A, double* B);

int main()
{
    
    double* a = new double(1);
    double* b = new double(4);
    double* out2 = new double(0);

    helper(out2, a, b);
}

int helper(double *OUT, double* A, double* B)
{
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);

    double* dev_out;
    double* dev_a;
    double* dev_b;

    //Allocating
    cudaMalloc((void**)&dev_out, sizeof(double));
    cudaMalloc((void**)&dev_a, sizeof(double));
    cudaMalloc((void**)&dev_b, sizeof(double));

    //Copying
    cudaMemcpy(dev_a, A, sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, B, sizeof(double), cudaMemcpyHostToDevice);

    //Synchronization
    cudaDeviceSynchronize();

    //Get the Output
    cudaStatus = cudaMemcpy(OUT, dev_out, sizeof(double), cudaMemcpyDefault);
    if (cudaStatus != cudaSuccess)
    {
        std::cout << "NOT SUCCESSFUL" << std::endl;
    }
    
    //Printing
    std::cout << *OUT << std::endl;

    //Freeing Memory
    cudaFree(dev_out);
    cudaFree(dev_a);
    cudaFree(dev_b);
    
    return 0;
}

This program compiles and runs fine, and in fact if I put a printf() in the kernel I can actually see that the two numbers are added properly. However, when the data is copied back to the host, the number that I receive is always 0, even though the cudaMemcpy reports success.

However, if I run the CUDA example template that ships with Visual Studio 2019:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    
    return cudaStatus;
}

This runs completely fine, even though the only difference I can see is that it adds vectors rather than scalars.

Does CUDA somehow require vectors rather than scalars, or am I missing something obvious?

In your first code, there is no call to the add kernel anywhere in the code you have posted. If I add an appropriate kernel call to your code, I get an output of 5 (as expected) rather than zero. I’m skeptical of the claim that a printf() in the kernel shows the two numbers being added properly; I don’t see how that is possible, with no call to the add routine.
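
As for why the cudaMemcpy itself reports success: copying from a device buffer that was allocated but never written is still a perfectly valid copy, so it returns cudaSuccess; you simply get back whatever bytes happen to be sitting in the allocation. A minimal standalone sketch (added here for illustration, not from the original thread):

#include "cuda_runtime.h"
#include <stdio.h>

int main()
{
    double* dev_p = nullptr;
    double host_val = -1.0;

    //Allocated on the device but never written by any kernel.
    cudaMalloc((void**)&dev_p, sizeof(double));

    //The copy is legal, so it returns cudaSuccess; the value copied back
    //is simply whatever bytes were already in the allocation.
    cudaError_t status = cudaMemcpy(&host_val, dev_p, sizeof(double),
                                    cudaMemcpyDeviceToHost);

    printf("status = %s, value = %f\n", cudaGetErrorString(status), host_val);

    cudaFree(dev_p);
    return 0;
}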

A kernel call looks something like this:

 addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

Your second code has one. Your first code doesn’t.

If you add something like this to your first code before the cudaDeviceSynchronize() call, I would expect good results:

add<<<1,1>>>(dev_out, dev_a, dev_b);
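
For reference, here is roughly what the helper looks like with the launch restored and a couple of basic error checks added. This is a sketch based on the code above, not a polished implementation; it assumes the original file's includes and the add kernel as posted:

int helper(double* OUT, double* A, double* B)
{
    cudaError_t cudaStatus = cudaSetDevice(0);

    double* dev_out;
    double* dev_a;
    double* dev_b;

    //Allocating
    cudaMalloc((void**)&dev_out, sizeof(double));
    cudaMalloc((void**)&dev_a, sizeof(double));
    cudaMalloc((void**)&dev_b, sizeof(double));

    //Copying
    cudaMemcpy(dev_a, A, sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, B, sizeof(double), cudaMemcpyHostToDevice);

    //The previously missing launch: one block of one thread is enough
    //for a scalar add.
    add<<<1, 1>>>(dev_out, dev_a, dev_b);

    //Catch launch-configuration errors before waiting on the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        std::cout << "launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    //Wait for the kernel to finish.
    cudaDeviceSynchronize();

    //Get the Output. cudaMemcpyDeviceToHost is explicit here; the
    //original's cudaMemcpyDefault also works on systems with unified
    //virtual addressing.
    cudaStatus = cudaMemcpy(OUT, dev_out, sizeof(double), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        std::cout << "NOT SUCCESSFUL" << std::endl;
    }

    //Printing (should now show 5 for the inputs in main())
    std::cout << *OUT << std::endl;

    //Freeing Memory
    cudaFree(dev_out);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return 0;
}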

Originally this was just a copying mistake where I forgot to include this line, and I can only be certain of that because of the version control I had. After adding the line back in (or reverting to the prior commit), it now works fine… I’m honestly just going to take it. Would it be correct to delete this thread, since I doubt anybody else will get much information from it, or should I just leave it up?

Do as you wish. I don’t try to suggest or impose any rules here, except in a few egregious cases, and this is not anything problematic in my view.