Device side cudaMemsetAsync inoperable on device allocated global memory

mathijs727 · May 11, 2022, 8:48am

I am running into an issue with cudaMemsetAsync, called from within a kernel, operating on memory that was allocated inside another kernel using device-side malloc.
The cudaMemsetAsync function operates correctly on this memory when invoked from the CPU.
But when I move the call inside a CUDA kernel, the memory is not written, and no error is reported either.

This issue happens on my RTX 2080-equipped machine as well as on my RTX 3070 Ti computer.
Both are running Windows 10 with CUDA Toolkit 11.6 and the 512.15 Studio driver.

I have created a small program to reproduce the problem:

#include <cstdint>
#include <cstdio>
#include <cuda.h>

// Allocates a buffer of sizeInU32 uint32_t elements with in-kernel malloc()
// and publishes the resulting pointer through *ppMemory.
//
//   ppMemory  - location that receives the pointer returned by malloc().
//   sizeInU32 - requested buffer size, in uint32_t elements.
//
// Every thread that executes this kernel performs its own allocation and
// overwrites *ppMemory, so launch it with a single thread (<<<1, 1>>>).
// In-kernel malloc() returns NULL when the request cannot be satisfied; that
// case is reported here instead of being left to surface later as a fault in
// whichever kernel dereferences the pointer.
__global__ void malloc_kernel(uint32_t** ppMemory, size_t sizeInU32)
{
    const size_t sizeInBytes = sizeInU32 * sizeof(uint32_t);
    uint32_t* const pBlock = static_cast<uint32_t*>(malloc(sizeInBytes));
    if (pBlock == nullptr) {
        printf("malloc_kernel: device malloc of %llu bytes failed\n",
            static_cast<unsigned long long>(sizeInBytes));
    }
    // Store the result even on failure so later readers see NULL rather than
    // an uninitialized pointer.
    *ppMemory = pBlock;
}
// Releases the buffer referenced by *ppMemory with in-kernel free().
// The pointer stored in *ppMemory is not modified.
__global__ void free_kernel(uint32_t** ppMemory)
{
    uint32_t* const pBlock = *ppMemory;
    free(pBlock);
}
// Issues a device-side cudaMemsetAsync over the buffer referenced by
// *ppMemory, waits for it, and prints any error that was reported.
//
//   ppMemory  - location holding the pointer to the buffer to fill.
//   value     - value handed to cudaMemsetAsync.
//   sizeInU32 - buffer size in uint32_t elements (converted to bytes here).
//
// Every thread that executes this kernel issues its own memset, so launch it
// with a single thread (<<<1, 1>>>).
//
// NOTE(review): device-side cudaDeviceSynchronize() is documented as
// deprecated starting with CUDA 11.6 and removed for newer targets; it is kept
// here because the original program relies on it - confirm against the
// toolkit version in use.
__global__ void asyncMemset_kernel(uint32_t** ppMemory, int value, size_t sizeInU32)
{
    // Capture each status individually: an error returned directly by a call
    // should not depend on cudaGetLastError() to be noticed.
    const cudaError_t memsetStatus = cudaMemsetAsync(*ppMemory, value, sizeInU32 * sizeof(uint32_t));
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    const cudaError_t lastStatus = cudaGetLastError();

    // Report the earliest failure.
    cudaError_t err = memsetStatus;
    if (err == cudaSuccess)
        err = syncStatus;
    if (err == cudaSuccess)
        err = lastStatus;

    if (err != cudaSuccess)
        printf("Error \"%s\"\n", cudaGetErrorString(err));
}
// Verifies that every uint32_t element of the buffer referenced by *ppMemory
// equals `expected`, printing one line per mismatching element.
//
//   ppMemory  - location holding the pointer to the buffer to inspect.
//   expected  - expected element value; compared as a 32-bit unsigned pattern.
//   sizeInU32 - number of uint32_t elements in the buffer.
//
// Launch layout: 1D grid of 1D blocks with at least sizeInU32 threads in
// total; each thread checks the element at its global index and threads past
// the end do nothing.
__global__ void checkValues_kernel(uint32_t** ppMemory, int expected, size_t sizeInU32)
{
    // Compute the index in size_t so the product cannot wrap in 32 bits when
    // the buffer is larger than 2^32 elements.
    const size_t globalIdx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (globalIdx >= sizeInU32)
        return;

    // Convert once so the comparison and the %u format both operate on
    // unsigned 32-bit values.
    const uint32_t expectedBits = static_cast<uint32_t>(expected);
    const uint32_t actual = (*ppMemory)[globalIdx];
    if (actual != expectedBits) {
        printf("Value mismatch! Expected %u , got %u\n", expectedBits, actual);
    }
}

// Reproduction driver: allocates a buffer inside a kernel, fills it either
// with a device-side cudaMemsetAsync (#if 1) or a host-side cudaMemsetAsync
// (#else), then checks the contents on the device and frees everything.
// Returns 0 on success and 1 if any CUDA runtime call or kernel launch reports
// an error.
int main()
{
    constexpr size_t allocSizeInU32 = 1024 * 1024;

    // Launch shape for the check kernel, derived from the allocation size so
    // the two cannot drift apart.
    constexpr unsigned threadsPerBlock = 1024;
    constexpr unsigned numBlocks = static_cast<unsigned>((allocSizeInU32 + threadsPerBlock - 1) / threadsPerBlock);

    // Prints a failed CUDA runtime status to stderr; returns true on success.
    const auto succeeded = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess)
            return true;
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    };

    // Managed slot through which the kernels exchange the device-allocated pointer.
    uint32_t** ppMemory = nullptr;
    if (!succeeded(cudaMallocManaged(&ppMemory, sizeof(uint32_t*)), "cudaMallocManaged"))
        return 1;

    malloc_kernel<<<1, 1>>>(ppMemory, allocSizeInU32);
    if (!succeeded(cudaGetLastError(), "malloc_kernel launch"))
        return 1;

#if 1
    asyncMemset_kernel<<<1, 1>>>(ppMemory, 0xF, allocSizeInU32);
    if (!succeeded(cudaGetLastError(), "asyncMemset_kernel launch"))
        return 1;
#else
    // Required on Windows: synchronize before the host reads from managed memory.
    if (!succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
        return 1;
    if (!succeeded(cudaMemsetAsync(*ppMemory, 0xF, allocSizeInU32 * sizeof(uint32_t)), "cudaMemsetAsync"))
        return 1;
#endif

    checkValues_kernel<<<numBlocks, threadsPerBlock>>>(ppMemory, 0x0F0F0F0F, allocSizeInU32);
    if (!succeeded(cudaGetLastError(), "checkValues_kernel launch"))
        return 1;

    free_kernel<<<1, 1>>>(ppMemory);
    if (!succeeded(cudaGetLastError(), "free_kernel launch"))
        return 1;

    // Wait for all kernels so execution errors are reported here and
    // device-side printf output is flushed before the program exits.
    if (!succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
        return 1;

    if (!succeeded(cudaFree(ppMemory), "cudaFree"))
        return 1;

    return 0;
}