Get device pointer without using pinned memory?

I have a main program in C that sequentially calls two different kernel wrappers and does some cpu calculations in between calls.

In the first wrapper, I do the memory allocations and transfers to the device. Once the kernel finishes, it copies the results back to the cpu but doesn’t free the gpu memory because it will be used later on. In the second kernel I need the pointers to the device data allocated in the first kernel.

My intention is to use the main program to exchange these pointers between the two kernels without using global variables. I used cudaHostAlloc() together with cudaHostGetDevicePointer() and it worked fine, but the first kernel takes about 50% longer to run with pinned memory compared to pageable. I tried using pageable memory and changed the first wrapper to return a pointer to device memory but then I always get an illegal memory access in the second kernel.

Is there any correct way of doing this? Or is this never intended to work without unified or pinned memory?

Thanks!

Based on your specifications, I created the following simple example using regular host and device memory, with pointers to allocated memory passed between the first and second wrapper functions.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>   /* memset */

#define THREADS 128

// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates `call` once, and on any failure prints the file, line, and the
// human-readable error string to stderr before terminating the process.
// The do { ... } while (0) idiom makes the macro expand to a single
// statement, so it is safe to use in an unbraced if/else.
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches.
// Checks two distinct error classes: (1) synchronous pre-launch errors
// (e.g. invalid launch configuration) via cudaGetLastError(), and
// (2) asynchronous execution errors (the kernel itself faulted) surfaced
// by cudaDeviceSynchronize(). Note the synchronize blocks the host until
// the kernel finishes, so this macro is debug-oriented; in hot paths the
// synchronize is typically dropped and only cudaGetLastError() is kept.
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Grid-stride kernel: adds 5 to every element of arr[0 .. len).
// Valid for any grid/block configuration, including a single thread.
__global__ void add5 (int *arr, int len)
{
    int idx  = blockIdx.x * blockDim.x + threadIdx.x;
    int step = gridDim.x * blockDim.x;
    while (idx < len) {
        arr[idx] += 5;
        idx += step;
    }
}

// Grid-stride kernel: multiplies every element of arr[0 .. len) by 5.
// Valid for any grid/block configuration, including a single thread.
__global__ void mul5 (int *arr, int len)
{
    int idx  = blockIdx.x * blockDim.x + threadIdx.x;
    int step = gridDim.x * blockDim.x;
    while (idx < len) {
        arr[idx] *= 5;
        idx += step;
    }
}

// First-stage wrapper: allocates and zero-initializes a host buffer and a
// matching device buffer of `len` ints, then launches the add5 kernel on the
// device data. Both pointers are handed back to the caller via the out-
// parameters so that a later wrapper can reuse the same device allocation
// without global variables. Exits the process on any allocation or launch
// failure.
//
// len:   number of int elements to allocate (assumed > 0)
// d_arr: out-parameter receiving the device pointer
// h_arr: out-parameter receiving the host pointer
void wrapper_1 (int len, int **d_arr, int **h_arr)
{
    *h_arr = (int *)malloc (sizeof(*h_arr[0]) * len);
    if (!*h_arr) {
        /* BUG FIX: format string previously ended in "line %\n" — an
           incomplete conversion specifier with __LINE__ passed as the
           argument, which is undefined behavior. Now "%d". */
        fprintf (stderr, "host alloc failed in file '%s' in line %d\n",
                 __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
    memset (*h_arr, 0x00, sizeof(*h_arr[0]) * len);
    CUDA_SAFE_CALL (cudaMalloc((void**)d_arr, sizeof(*d_arr[0]) * len));
    CUDA_SAFE_CALL (cudaMemset(*d_arr, 0x00, sizeof(*d_arr[0]) * len));
    dim3 dimBlock (THREADS);
    /* ceiling division so the grid covers all len elements */
    int threadBlocks = (len + (dimBlock.x - 1)) / dimBlock.x;
    dim3 dimGrid(threadBlocks);
    add5<<<dimGrid,dimBlock>>>(*d_arr, len);
    CHECK_LAUNCH_ERROR();
}

// Second-stage wrapper: launches the mul5 kernel on the device buffer that
// wrapper_1 allocated earlier, copies the result back into the host buffer,
// and prints every element. Does not free either buffer; ownership stays
// with the caller.
void wrapper_2 (int len, int *d_arr, int *h_arr)
{
    dim3 block (THREADS);
    dim3 grid ((len + block.x - 1) / block.x);  /* ceiling division */
    mul5<<<grid,block>>>(d_arr, len);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaMemcpy (h_arr, d_arr, sizeof (h_arr[0]) * len,
                                cudaMemcpyDeviceToHost));
    for (int k = 0; k < len; k++) {
        printf ("%d: %d\n", k, h_arr[k]);
    }
}

// Demonstrates passing plain (pageable host + device) pointers between two
// wrapper calls: wrapper_1 allocates both buffers and runs add5; wrapper_2
// reuses the same device pointer for mul5 and copies the result back.
// All cleanup happens here, in the owner of the pointers.
int main (void)
{
    const int len = 10;
    int *d_arr = NULL;
    int *h_arr = NULL;
    wrapper_1 (len, &d_arr, &h_arr);
    wrapper_2 (len, d_arr, h_arr);
    CUDA_SAFE_CALL (cudaFree (d_arr));
    free (h_arr);
    return EXIT_SUCCESS;
}

Thanks @njuffa, I just replaced your line 64 for this one:

CUDA_SAFE_CALL (cudaMemcpy(*d_arr, *h_arr, sizeof(*h_arr[0]) * len, \
                           cudaMemcpyHostToDevice));

to better mimic my code and it worked fine.

I then realized my error wasn’t related to whether or not CUDA was capable of doing this specifically. I was dividing the workload across multiple devices (within the same node) using OpenMP, and I realized *d_arr was declared outside the omp parallel section. Using double pointers to store the per-device pointers, and moving *d_arr inside the for loop, did the trick.

Thanks for your quick solution though, it definitely helped me figure out what else I had wrong in my code.